"""Extract selected columns from a MySQL dump into a JSON Lines file.

The script reads ``CREATE TABLE`` statements to learn the column order and
then converts every row of the following ``INSERT INTO ... VALUES`` statements
into one JSON object per line, keeping only the columns named in
``selected_fields``.
"""

import json
import logging
import re

logger = logging.getLogger(__name__)

# Specify fields to extract
selected_fields = ['content']

# Mapping from original field names to desired keys
mapping = {
    'content': 'content',
}

# Input dump, output file and the encoding used for both.
# NOTE(review): UTF-8 is assumed because it is what mysqldump writes by
# default; change FILE_ENCODING if the dump was produced differently.
SQL_PATH = 'maya_business_bill.sql'
OUTPUT_PATH = 'output.jsonl'
FILE_ENCODING = 'utf-8'

# Start of a CREATE TABLE statement up to and including the opening "(".
_CREATE_TABLE_RE = re.compile(r'CREATE TABLE.*?\(', re.DOTALL)

# Start of an INSERT statement up to (not including) the first row tuple.
_INSERT_PREFIX_RE = re.compile(r'INSERT.*?INTO.*?VALUES\s*', re.DOTALL)

# Leading words that introduce a key/constraint rather than a column.
_NON_COLUMN_KEYWORDS = frozenset({
    'PRIMARY', 'UNIQUE', 'KEY', 'INDEX', 'CONSTRAINT',
    'FOREIGN', 'FULLTEXT', 'SPATIAL', 'CHECK',
})

# Backslash escapes inside a MySQL string literal that map to another char.
_MYSQL_ESCAPES = {
    '0': '\0',
    'b': '\b',
    'n': '\n',
    'r': '\r',
    't': '\t',
    'Z': '\x1a',
}
_ESCAPE_RE = re.compile(r"\\(.)|''", re.DOTALL)


def _skip_quoted(sql, pos):
    """Return the index just past the quoted token that starts at ``pos``.

    ``sql[pos]`` must be the opening quote (``'``, ``"`` or a backtick).
    Backslash escapes (not inside backticks) and doubled quote characters are
    treated as part of the token.  An unterminated token runs to the end.
    """
    quote = sql[pos]
    length = len(sql)
    pos += 1
    while pos < length:
        char = sql[pos]
        if char == '\\' and quote != '`':
            pos += 2  # skip the backslash and the escaped character
            continue
        if char == quote:
            if pos + 1 < length and sql[pos + 1] == quote:
                pos += 2  # doubled quote is an escaped quote, not the end
                continue
            return pos + 1
        pos += 1
    return length


def _split_parenthesized(sql, start):
    """Split the comma-separated list that follows an opening parenthesis.

    ``start`` is the index just after the ``(``.  Commas inside quoted tokens
    or nested parentheses do not split.  Returns ``(items, end)`` where
    ``items`` are the stripped raw item texts and ``end`` is the index of the
    matching ``)``, or ``-1`` if the list is never closed.  A trailing empty
    item is dropped.
    """
    items = []
    item_start = start
    depth = 0
    pos = start
    length = len(sql)
    while pos < length:
        char = sql[pos]
        if char in '\'"`':
            pos = _skip_quoted(sql, pos)
            continue
        if char == '(':
            depth += 1
        elif char == ')':
            if depth == 0:
                last = sql[item_start:pos].strip()
                if last:
                    items.append(last)
                return items, pos
            depth -= 1
        elif char == ',' and depth == 0:
            items.append(sql[item_start:pos].strip())
            item_start = pos + 1
        pos += 1
    return items, -1


def _unescape_match(match):
    """Translate one escape sequence matched by ``_ESCAPE_RE``."""
    char = match.group(1)
    if char is None:
        return "'"  # doubled single quote
    if char in '%_':
        return match.group(0)  # MySQL keeps the backslash for \% and \_
    return _MYSQL_ESCAPES.get(char, char)


def _clean_value(raw):
    """Return the text of one raw SQL value.

    Single-quoted strings lose their quotes and have their escape sequences
    resolved; anything else (numbers, ``NULL``, expressions) is returned as
    its stripped source text.
    """
    raw = raw.strip()
    if len(raw) >= 2 and raw[0] == "'" and raw[-1] == "'":
        return _ESCAPE_RE.sub(_unescape_match, raw[1:-1])
    return raw


def _iter_insert_rows(sql):
    """Yield the cleaned value list of every row of every INSERT in ``sql``.

    Handles multi-row statements (``VALUES (..),(..);``).  Parsing stops
    silently at a row whose closing parenthesis is missing.
    """
    pos = 0
    length = len(sql)
    while True:
        match = _INSERT_PREFIX_RE.search(sql, pos)
        if match is None:
            return
        pos = match.end()
        while True:
            # Skip the separators between consecutive row tuples.
            while pos < length and (sql[pos].isspace() or sql[pos] == ','):
                pos += 1
            if pos >= length or sql[pos] != '(':
                break  # end of this statement
            raw_values, end = _split_parenthesized(sql, pos + 1)
            if end == -1:
                return  # truncated row
            yield [_clean_value(value) for value in raw_values]
            pos = end + 1


def _selected_indices(field_names):
    """Return the position of each selected field within ``field_names``.

    Raises:
        ValueError: if a selected field is not among ``field_names``.
    """
    missing = [name for name in selected_fields if name not in field_names]
    if missing:
        raise ValueError(
            f"selected field(s) {missing} not found in columns {field_names}"
        )
    return [field_names.index(name) for name in selected_fields]


def _build_row(values, indices):
    """Build the output dict for one row from its values and column indices."""
    return {
        mapping.get(name, name): values[index]
        for name, index in zip(selected_fields, indices)
    }


# Function to extract fields from CREATE TABLE
def extract_fields(create_table_sql):
    """Return the column names of the first CREATE TABLE in the given SQL.

    Args:
        create_table_sql: text containing a ``CREATE TABLE ... (...)``
            statement.

    Returns:
        Column names in declaration order; key and constraint definitions
        are skipped.  An empty list if no complete column list is found.
    """
    match = _CREATE_TABLE_RE.search(create_table_sql)
    if match is None:
        return []
    definitions, end = _split_parenthesized(create_table_sql, match.end())
    if end == -1:
        return []
    logger.debug("Fields: %s", definitions)

    field_names = []
    for definition in definitions:
        if definition.startswith('`'):
            field_names.append(definition.split('`')[1])
            continue
        # Unquoted definition: a column unless it opens with a key keyword.
        word = re.match(r'\w+', definition)
        if word and word.group(0).upper() not in _NON_COLUMN_KEYWORDS:
            field_names.append(word.group(0))
    return field_names


# Function to extract data from INSERT statements
def extract_data(insert_sql, field_names):
    """Return one dict per inserted row, limited to the selected fields.

    Args:
        insert_sql: text containing zero or more INSERT statements.
        field_names: column names in table order, as returned by
            ``extract_fields``.

    Returns:
        A list of dicts keyed by the mapped names of ``selected_fields``.

    Raises:
        ValueError: if rows exist and a selected field is not in
            ``field_names``.
        IndexError: if a row has fewer values than a selected column index.
    """
    rows = list(_iter_insert_rows(insert_sql))
    logger.debug("Rows found: %d", len(rows))
    if not rows:
        return []
    indices = _selected_indices(field_names)
    return [_build_row(values, indices) for values in rows]


def main(sql_path=SQL_PATH, output_path=OUTPUT_PATH):
    """Convert the SQL dump at ``sql_path`` into JSON Lines at ``output_path``."""
    # Imported here so the parsing helpers stay importable without the
    # progress-bar dependency.
    from tqdm import tqdm

    # First pass only counts lines so the progress bar has a total.
    print("Calculating total lines...")
    with open(sql_path, 'r', encoding=FILE_ENCODING) as f:
        total_lines = sum(1 for _ in f)
    print(f"Total lines: {total_lines}")

    create_lines = []
    in_create = False
    selected_indices = None  # unknown until a CREATE TABLE has been parsed

    with open(sql_path, 'r', encoding=FILE_ENCODING) as f, \
            open(output_path, 'w', encoding=FILE_ENCODING) as out_f:
        for line in tqdm(f, total=total_lines, desc="Processing SQL"):
            stripped = line.strip()
            if stripped.startswith('CREATE TABLE'):
                in_create = True
                create_lines = []  # do not mix in a previous table's DDL
            if in_create:
                create_lines.append(line)
                if stripped.endswith(';'):
                    in_create = False
                    field_names = extract_fields(''.join(create_lines))
                    print("Extracted fields:", field_names)
                    selected_indices = _selected_indices(field_names)
            elif stripped.startswith('INSERT INTO'):
                if selected_indices is None:
                    continue  # no schema seen yet, columns are unknown
                for values in _iter_insert_rows(stripped):
                    row = _build_row(values, selected_indices)
                    out_f.write(json.dumps(row) + '\n')

    print(f"JSONL output saved to {output_path}")


# Main logic
if __name__ == "__main__":
    main()