# 156 lines | 5.9 KiB | Python
import re
|
|
import json
|
|
from tqdm import tqdm
|
|
|
|
# Source-table columns whose values are copied into each output record.
selected_fields = ["content"]

# Output key to use for each selected column (source column -> output key).
# A column without an entry here is written under its own name.
mapping = {"content": "content"}


# Function to extract fields from CREATE TABLE
def _split_column_definitions(sql, open_pos):
    """Split the parenthesised body opening at ``sql[open_pos]`` on top-level commas.

    Commas and parentheses that sit inside a quoted string, a backtick
    identifier, or a nested pair of parentheses do not split or close
    anything.

    Returns:
        ``(items, close_pos)`` where ``close_pos`` is the index of the
        matching ``)``, or ``(None, -1)`` when the body is never closed.
    """
    items = []
    current = []
    depth = 0
    quote = None  # the quote character we are inside of, if any
    pos = open_pos + 1
    end = len(sql)
    while pos < end:
        char = sql[pos]
        if quote is not None:
            # Backslash escapes apply to '...' and "..." but not to `...`.
            if char == '\\' and quote != '`' and pos + 1 < end:
                current.append(sql[pos:pos + 2])
                pos += 2
                continue
            if char == quote:
                # A doubled quote simply closes and immediately reopens.
                quote = None
        elif char in '\'"`':
            quote = char
        elif char == '(':
            depth += 1
        elif char == ')':
            if depth == 0:
                last = ''.join(current).strip()
                if last:
                    items.append(last)
                return items, pos
            depth -= 1
        elif char == ',' and depth == 0:
            items.append(''.join(current).strip())
            current = []
            pos += 1
            continue
        current.append(char)
        pos += 1
    return None, -1


def extract_fields(create_table_sql):
    """Return the column names declared in a ``CREATE TABLE`` statement.

    Only definitions that start with a backtick-quoted identifier count as
    columns, which skips ``PRIMARY KEY``, ``UNIQUE``, ``KEY`` and ``INDEX``
    clauses. The column list is located by matching the first ``(`` after
    ``CREATE TABLE`` with its balancing ``)``, so text after the list
    (table options, comments containing parentheses) is never scanned.

    Args:
        create_table_sql: Text containing one ``;``-terminated
            ``CREATE TABLE`` statement.

    Returns:
        Column names in declaration order, or ``[]`` when no terminated
        ``CREATE TABLE`` statement with a balanced column list is found.
    """
    head = re.search(r'CREATE TABLE.*?\(', create_table_sql, re.DOTALL)
    if not head:
        return []

    open_pos = head.end() - 1
    fields, close_pos = _split_column_definitions(create_table_sql, open_pos)
    # Like the original pattern, require a terminating ';' after the list.
    if fields is None or ';' not in create_table_sql[close_pos:]:
        return []

    fields_section = create_table_sql[open_pos + 1:close_pos]
    print("Fields section:", repr(fields_section))
    print("Fields:", fields)

    # Key/index clauses start with a keyword, never with a backtick, so the
    # backtick test alone separates columns from constraints.
    return [field.split('`')[1] for field in fields if field.startswith('`')]


# Function to extract data from INSERT statements

# MySQL backslash escapes inside string literals. Any escaped character not
# listed here stands for itself; \% and \_ keep their backslash.
_MYSQL_ESCAPES = {
    '0': '\0', 'b': '\b', 'n': '\n', 'r': '\r', 't': '\t', 'Z': '\x1a',
    '%': '\\%', '_': '\\_',
}
_ESCAPED_CHAR = re.compile(r"\\(.)|''", re.DOTALL)
_INSERT_HEAD = re.compile(r'INSERT.*?INTO.*?VALUES\s*\(', re.DOTALL)
_NEXT_TUPLE = re.compile(r'\s*,\s*\(')


def _split_value_tuple(sql, open_pos):
    """Split the VALUES tuple opening at ``sql[open_pos]`` into raw tokens.

    Commas and parentheses inside single-quoted strings (including strings
    containing ``\\'`` or ``''``) or inside nested parentheses do not split.

    Returns:
        ``(tokens, next_pos)`` with ``next_pos`` just past the closing
        ``)``, or ``(None, len(sql))`` when the tuple is never closed.
    """
    tokens = []
    current = []
    depth = 0
    in_string = False
    pos = open_pos + 1
    end = len(sql)
    while pos < end:
        char = sql[pos]
        if in_string:
            if char == '\\' and pos + 1 < end:
                # Keep the escape pair together so an escaped quote cannot
                # be mistaken for the end of the string.
                current.append(sql[pos:pos + 2])
                pos += 2
                continue
            if char == "'":
                in_string = False
        elif char == "'":
            in_string = True
        elif char == '(':
            depth += 1
        elif char == ')':
            if depth == 0:
                last = ''.join(current).strip()
                if last:
                    tokens.append(last)
                return tokens, pos + 1
            depth -= 1
        elif char == ',' and depth == 0:
            tokens.append(''.join(current).strip())
            current = []
            pos += 1
            continue
        current.append(char)
        pos += 1
    return None, end


def _clean_value(token):
    """Strip the quotes from a string-literal token and decode its escapes."""
    if not (token.startswith("'") and token.endswith("'")):
        return token

    def decode(match):
        escaped = match.group(1)
        if escaped is None:  # matched a doubled quote ('')
            return "'"
        return _MYSQL_ESCAPES.get(escaped, escaped)

    return _ESCAPED_CHAR.sub(decode, token[1:-1])


def _parse_insert_tuples(sql):
    """Return ``(raw_text, values)`` for every VALUES tuple found in *sql*."""
    parsed = []
    pos = 0
    while True:
        head = _INSERT_HEAD.search(sql, pos)
        if head is None:
            return parsed
        pos = head.end() - 1
        while True:
            start = pos
            tokens, pos = _split_value_tuple(sql, start)
            if tokens is None:  # unterminated tuple: nothing more to read
                return parsed
            values = [_clean_value(token) for token in tokens]
            parsed.append((sql[start + 1:pos - 1], values))
            # A multi-row INSERT continues with ", (" after each tuple.
            follow = _NEXT_TUPLE.match(sql, pos)
            if follow is None:
                break
            pos = follow.end() - 1


def extract_data(insert_sql, field_names, fields=None, key_mapping=None):
    """Build one dict per row found in the ``INSERT ... VALUES`` statements.

    Every tuple of a multi-row ``VALUES (...), (...)`` list becomes its own
    row. Single-quoted values are unquoted and their escape sequences
    decoded; every other value (numbers, ``NULL``, expressions) is returned
    as its stripped source text.

    Args:
        insert_sql: Text containing zero or more INSERT statements.
        field_names: Column names of the table, in the order the values
            appear in each tuple.
        fields: Columns to keep. Defaults to the module-level
            ``selected_fields``.
        key_mapping: Column name -> output key. Columns without an entry
            keep their own name. Defaults to the module-level ``mapping``.

    Returns:
        A list of dicts, one per parsed row, in source order.

    Raises:
        ValueError: if a selected column is missing from ``field_names``
            (only when at least one row was found).
        IndexError: if a row has fewer values than a selected column's
            position requires.
    """
    chosen = selected_fields if fields is None else fields
    key_for = mapping if key_mapping is None else key_mapping

    print("Insert sql:", repr(insert_sql[:200]))
    parsed = _parse_insert_tuples(insert_sql)
    print("Inserts found:", len(parsed))
    for i, (raw, _values) in enumerate(parsed):
        print(f"Insert {i}:", repr(raw[:100]))
    if not parsed:
        return []

    # The column positions are the same for every row, so resolve them once.
    selected_indices = [field_names.index(name) for name in chosen]
    return [
        {
            key_for.get(name, name): values[index]
            for name, index in zip(chosen, selected_indices)
        }
        for _raw, values in parsed
    ]


# Main logic

# MySQL backslash escapes inside string literals. Any escaped character not
# listed here stands for itself; \% and \_ keep their backslash.
_MYSQL_ESCAPES = {
    '0': '\0', 'b': '\b', 'n': '\n', 'r': '\r', 't': '\t', 'Z': '\x1a',
    '%': '\\%', '_': '\\_',
}
_ESCAPED_CHAR = re.compile(r"\\(.)|''", re.DOTALL)
_INSERT_HEAD = re.compile(r'INSERT.*?INTO.*?VALUES\s*\(', re.DOTALL)
_NEXT_TUPLE = re.compile(r'\s*,\s*\(')


def _split_value_tuple(sql, open_pos):
    """Split the VALUES tuple opening at ``sql[open_pos]`` into raw tokens.

    Commas and parentheses inside single-quoted strings (including strings
    containing ``\\'`` or ``''``) or inside nested parentheses do not split.

    Returns:
        ``(tokens, next_pos)`` with ``next_pos`` just past the closing
        ``)``, or ``(None, len(sql))`` when the tuple is never closed.
    """
    tokens = []
    current = []
    depth = 0
    in_string = False
    pos = open_pos + 1
    end = len(sql)
    while pos < end:
        char = sql[pos]
        if in_string:
            if char == '\\' and pos + 1 < end:
                # Keep the escape pair together so an escaped quote cannot
                # be mistaken for the end of the string.
                current.append(sql[pos:pos + 2])
                pos += 2
                continue
            if char == "'":
                in_string = False
        elif char == "'":
            in_string = True
        elif char == '(':
            depth += 1
        elif char == ')':
            if depth == 0:
                last = ''.join(current).strip()
                if last:
                    tokens.append(last)
                return tokens, pos + 1
            depth -= 1
        elif char == ',' and depth == 0:
            tokens.append(''.join(current).strip())
            current = []
            pos += 1
            continue
        current.append(char)
        pos += 1
    return None, end


def _clean_value(token):
    """Strip the quotes from a string-literal token and decode its escapes."""
    if not (token.startswith("'") and token.endswith("'")):
        return token

    def decode(match):
        escaped = match.group(1)
        if escaped is None:  # matched a doubled quote ('')
            return "'"
        return _MYSQL_ESCAPES.get(escaped, escaped)

    return _ESCAPED_CHAR.sub(decode, token[1:-1])


def _parse_insert_tuples(sql):
    """Return ``(raw_text, values)`` for every VALUES tuple found in *sql*."""
    parsed = []
    pos = 0
    while True:
        head = _INSERT_HEAD.search(sql, pos)
        if head is None:
            return parsed
        pos = head.end() - 1
        while True:
            start = pos
            tokens, pos = _split_value_tuple(sql, start)
            if tokens is None:  # unterminated tuple: nothing more to read
                return parsed
            values = [_clean_value(token) for token in tokens]
            parsed.append((sql[start + 1:pos - 1], values))
            # A multi-row INSERT continues with ", (" after each tuple.
            follow = _NEXT_TUPLE.match(sql, pos)
            if follow is None:
                break
            pos = follow.end() - 1


def main(sql_path='maya_business_bill.sql', output_path='output.jsonl'):
    """Convert the INSERT rows of a SQL dump into a JSONL file.

    The dump is read line by line. Each ``CREATE TABLE`` statement is
    collected until a line ending in ``;`` and passed to ``extract_fields``
    to learn the column order. Every later line starting with
    ``INSERT INTO`` is parsed, and one JSON object holding the
    ``selected_fields`` (renamed through ``mapping``) is written per row.

    Args:
        sql_path: Path of the SQL dump to read.
        output_path: Path of the JSONL file to write (overwritten).

    Raises:
        ValueError: if a table lacks one of ``selected_fields``.
        IndexError: if a row has fewer values than a selected column's
            position requires.
    """
    # First pass only counts lines so the progress bar has a total.
    print("Calculating total lines...")
    with open(sql_path, 'r') as sql_file:
        total_lines = sum(1 for _ in sql_file)
    print(f"Total lines: {total_lines}")

    create_lines = []
    in_create = False
    selected_indices = []

    with open(sql_path, 'r') as sql_file, open(output_path, 'w') as out_f:
        for line in tqdm(sql_file, total=total_lines, desc="Processing SQL"):
            stripped = line.strip()
            if stripped.startswith('CREATE TABLE'):
                in_create = True
                # Start from scratch so a second table's definition is not
                # appended to the previous one.
                create_lines = []

            if in_create:
                create_lines.append(line)
                if stripped.endswith(';'):
                    in_create = False
                    field_names = extract_fields(''.join(create_lines))
                    print("Extracted fields:", field_names)
                    selected_indices = [
                        field_names.index(name) for name in selected_fields
                    ]
            elif stripped.startswith('INSERT INTO'):
                # Process INSERT: one output line per VALUES tuple.
                for _raw, values in _parse_insert_tuples(stripped):
                    row = {
                        mapping.get(name, name): values[index]
                        for name, index in zip(selected_fields, selected_indices)
                    }
                    out_f.write(json.dumps(row) + '\n')

    print(f"JSONL output saved to {output_path}")


if __name__ == "__main__":
    main()