# ml_data_template/sql2json.py
#
# Listing metadata carried over from the repository viewer:
# 156 lines, 5.9 KiB, Python; timestamp 2025-09-09 15:10:49 +08:00
import re
import json
from tqdm import tqdm
# Columns to pull out of every INSERT row
selected_fields = ['content']
# Source column name -> key used for that column in the emitted JSON objects
mapping = {'content': 'content'}
# Function to extract fields from CREATE TABLE
def extract_fields(create_table_sql):
    """Return the backtick-quoted column names of a ``CREATE TABLE`` statement.

    The column section is everything between the first ``(`` that follows
    ``CREATE TABLE`` and its matching ``)``.  It is split on commas that sit
    outside nested parentheses and outside quoted text, so commas or
    parentheses inside ``COMMENT '...'`` strings, ``decimal(10,2)`` or
    ``enum('a','b')`` do not break a definition apart.  Table options after
    the closing parenthesis are never scanned.

    Only definitions that begin with a backtick are treated as columns;
    ``PRIMARY KEY``, ``KEY``, ``UNIQUE`` ... clauses are skipped because they
    start with a keyword.

    Args:
        create_table_sql: Text containing one ``CREATE TABLE`` statement.

    Returns:
        Column names in declaration order.  An empty list when there is no
        ``CREATE TABLE ... (`` header or the column section is never closed.

    The raw column section and the split definitions are printed to stdout
    as debugging output.
    """
    header = re.search(r'CREATE TABLE.*?\(', create_table_sql, re.DOTALL)
    if not header:
        return []
    section_start = header.end()

    fields = []
    piece_start = section_start
    section_end = None
    depth = 0
    quote = None      # the quote character we are currently inside, if any
    escaped = False   # previous character was a backslash inside a string
    for pos in range(section_start, len(create_table_sql)):
        char = create_table_sql[pos]
        if quote is not None:
            if escaped:
                escaped = False
            elif char == '\\' and quote != '`':
                # Backslash escapes apply to string literals, not identifiers.
                escaped = True
            elif char == quote:
                # A doubled quote ('') closes and immediately reopens, which
                # is harmless because nothing is examined in between.
                quote = None
        elif char in '\'"`':
            quote = char
        elif char == '(':
            depth += 1
        elif char == ')':
            if depth == 0:
                # This is the parenthesis that closes the column section.
                section_end = pos
                break
            depth -= 1
        elif char == ',' and depth == 0:
            fields.append(create_table_sql[piece_start:pos].strip())
            piece_start = pos + 1

    if section_end is None:
        return []

    print("Fields section:", repr(create_table_sql[section_start:section_end]))
    last_piece = create_table_sql[piece_start:section_end].strip()
    if last_piece:
        fields.append(last_piece)
    print("Fields:", fields)

    # Extract field names
    field_names = []
    for field in fields:
        if not field.startswith('`'):
            continue
        parts = field.split('`')
        if len(parts) > 2:  # needs both an opening and a closing backtick
            field_names.append(parts[1])
    return field_names
# Function to extract data from INSERT statements
# Start of an INSERT statement up to (not including) the first value tuple.
_INSERT_HEAD_RE = re.compile(r'INSERT.*?INTO.*?VALUES\s*(?=\()', re.DOTALL)
# A backslash escape or a doubled single quote inside a string literal.
_ESCAPE_RE = re.compile(r"\\(.)|''", re.DOTALL)
# MySQL backslash escapes whose meaning is not simply "the character itself".
_MYSQL_ESCAPES = {
    '0': '\0',
    'b': '\b',
    'n': '\n',
    'r': '\r',
    't': '\t',
    'Z': '\x1a',
    # Outside pattern matching MySQL keeps these two sequences verbatim.
    '%': '\\%',
    '_': '\\_',
}


def _skip_whitespace(sql, pos):
    """Return the index of the first non-whitespace character at or after pos."""
    size = len(sql)
    while pos < size and sql[pos].isspace():
        pos += 1
    return pos


def _scan_tuple(sql, start):
    """Split the value tuple opening at ``sql[start] == '('`` into raw tokens.

    Commas inside single-quoted strings or nested parentheses do not split.
    Returns ``(tokens, end)`` with ``end`` the index of the closing ``)``,
    or ``None`` when the tuple is never closed.
    """
    tokens = []
    token_start = start + 1
    depth = 0
    in_string = False
    pos = start + 1
    size = len(sql)
    while pos < size:
        char = sql[pos]
        if in_string:
            if char == '\\':
                pos += 1  # skip the escaped character, e.g. the ' in \'
            elif char == "'":
                # A doubled '' leaves and re-enters the string back to back.
                in_string = False
        elif char == "'":
            in_string = True
        elif char == '(':
            depth += 1
        elif char == ')':
            if depth == 0:
                last = sql[token_start:pos].strip()
                if last:
                    tokens.append(last)
                return tokens, pos
            depth -= 1
        elif char == ',' and depth == 0:
            tokens.append(sql[token_start:pos].strip())
            token_start = pos + 1
        pos += 1
    return None


def _iter_value_tuples(sql):
    """Yield ``(raw_text, tokens)`` for every row tuple of every INSERT in sql."""
    pos = 0
    size = len(sql)
    while True:
        head = _INSERT_HEAD_RE.search(sql, pos)
        if head is None:
            return
        pos = head.end()
        # One statement may carry several rows: VALUES (...),(...),(...);
        while pos < size and sql[pos] == '(':
            scanned = _scan_tuple(sql, pos)
            if scanned is None:
                return  # unterminated tuple: nothing after it can be trusted
            tokens, end = scanned
            yield sql[pos + 1:end], tokens
            pos = _skip_whitespace(sql, end + 1)
            if pos < size and sql[pos] == ',':
                pos = _skip_whitespace(sql, pos + 1)
            else:
                break


def _unescape(match):
    """Translate one match of ``_ESCAPE_RE`` into the character it denotes."""
    escaped = match.group(1)
    if escaped is None:  # the doubled-quote alternative matched
        return "'"
    return _MYSQL_ESCAPES.get(escaped, escaped)


def _clean_value(token):
    """Strip the quotes from a string literal and decode its escapes.

    Unquoted tokens (numbers, NULL, expressions) are returned unchanged.
    """
    if len(token) >= 2 and token[0] == "'" and token[-1] == "'":
        return _ESCAPE_RE.sub(_unescape, token[1:-1])
    return token


def _resolve_indices(field_names, fields):
    """Return the position of each requested field within field_names."""
    missing = [name for name in fields if name not in field_names]
    if missing:
        raise ValueError(
            f"selected field(s) {missing} not found in table columns {field_names}"
        )
    return [field_names.index(name) for name in fields]


def _build_row(tokens, fields, indices, key_mapping):
    """Build one output dict from the raw tokens of a single row."""
    return {
        key_mapping.get(name, name): _clean_value(tokens[index])
        for name, index in zip(fields, indices)
    }


def extract_data(insert_sql, field_names, fields=None, key_mapping=None):
    """Extract the selected columns from every row of the INSERT statements.

    Handles multi-row ``VALUES (...),(...)`` statements, string values that
    contain commas, parentheses, ``);`` or escaped quotes, and decodes
    backslash escapes and doubled quotes in string literals.  Unquoted
    values are returned as their literal text (``1`` -> ``'1'``,
    ``NULL`` -> ``'NULL'``).

    Args:
        insert_sql: Text containing zero or more ``INSERT ... VALUES`` statements.
        field_names: Column names of the table in declaration order.
        fields: Columns to keep; defaults to the module-level ``selected_fields``.
        key_mapping: Column name -> output key; defaults to the module-level
            ``mapping``.  Unmapped columns keep their own name.

    Returns:
        One dict per row, in input order.

    Raises:
        ValueError: A requested field is not in ``field_names`` (only checked
            when at least one row was found).
        IndexError: A row has fewer values than a requested column position.

    A preview of the input and of each row found is printed to stdout as
    debugging output.
    """
    if fields is None:
        fields = selected_fields
    if key_mapping is None:
        key_mapping = mapping

    print("Insert sql:", repr(insert_sql[:200]))
    tuples = list(_iter_value_tuples(insert_sql))
    print("Inserts found:", len(tuples))
    for i, (raw, _) in enumerate(tuples):
        print(f"Insert {i}:", repr(raw[:100]))
    if not tuples:
        return []

    indices = _resolve_indices(field_names, fields)
    return [_build_row(tokens, fields, indices, key_mapping) for _, tokens in tuples]


# Main logic
def main(sql_path='maya_business_bill.sql', output_path='output.jsonl'):
    """Stream a SQL dump line by line and write the selected columns as JSONL.

    The column order is taken from the most recent ``CREATE TABLE`` statement
    (which may span several lines); each following line that starts with
    ``INSERT INTO`` contributes one JSON object per row to ``output_path``.
    An INSERT statement is expected to sit on a single line.

    Raises:
        ValueError: An INSERT line appears before any CREATE TABLE statement,
            or a selected field is not a column of the table.
    """
    print("Calculating total lines...")
    with open(sql_path, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)
    print(f"Total lines: {total_lines}")

    create_lines = []
    in_create = False
    selected_indices = None
    with open(sql_path, 'r', encoding='utf-8') as f, \
            open(output_path, 'w', encoding='utf-8') as out_f:
        for line in tqdm(f, total=total_lines, desc="Processing SQL"):
            stripped = line.strip()
            if stripped.startswith('CREATE TABLE'):
                in_create = True
                # Start afresh so an earlier table's text is not parsed again.
                create_lines = []
            if in_create:
                create_lines.append(line)
                if stripped.endswith(';'):
                    in_create = False
                    field_names = extract_fields(''.join(create_lines))
                    print("Extracted fields:", field_names)
                    selected_indices = _resolve_indices(field_names, selected_fields)
            elif stripped.startswith('INSERT INTO'):
                # Process INSERT
                if selected_indices is None:
                    # Without the schema the column positions are unknown;
                    # fail loudly instead of writing empty records.
                    raise ValueError("INSERT statement found before any CREATE TABLE")
                for _, tokens in _iter_value_tuples(stripped):
                    row = _build_row(tokens, selected_fields, selected_indices, mapping)
                    out_f.write(json.dumps(row) + '\n')
    print(f"JSONL output saved to {output_path}")


if __name__ == "__main__":
    main()