dev07 2025-09-09 15:10:49 +08:00
parent 6267458d07
commit 070ce91c53
3 changed files with 285 additions and 84 deletions

.gitignore vendored (16 changes)

@@ -4,4 +4,20 @@
*.txt
*.csv
*.log
*.pkl
*.pth
*.pyc
*.zip
*.ipynb
*.pyd
*.egg-info
*.egg
*.so
*.sql
*.db
*.db3
*.dll
*.pyo
__pycache__

sql2json.py (new file, 156 lines)

@@ -0,0 +1,156 @@
import re
import json
from tqdm import tqdm

# Specify fields to extract
selected_fields = ['content']

# Mapping from original field names to desired keys
mapping = {
    'content': 'content',
}
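# (Illustrative, not part of the original file: setting mapping = {'content': 'text'}
# would rename the key, emitting {"text": ...} in the output instead.)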
# Function to extract fields from CREATE TABLE
def extract_fields(create_table_sql):
    # Find the fields section between the outer parentheses
    match = re.search(r'CREATE TABLE.*?\((.*)\).*?;', create_table_sql, re.DOTALL)
    if not match:
        return []
    fields_section = match.group(1)
    print("Fields section:", repr(fields_section))
    # Split by comma, but handle nested parentheses (e.g. varchar(255))
    fields = []
    current_field = ""
    paren_count = 0
    for char in fields_section:
        if char == '(':
            paren_count += 1
        elif char == ')':
            paren_count -= 1
        elif char == ',' and paren_count == 0:
            fields.append(current_field.strip())
            current_field = ""
            continue
        current_field += char
    if current_field.strip():
        fields.append(current_field.strip())
    print("Fields:", fields)
    # Extract backquoted field names, skipping key/index definitions
    field_names = []
    for field in fields:
        field = field.strip()
        if field.startswith('`') and not field.upper().startswith(('PRIMARY', 'UNIQUE', 'KEY', 'INDEX')):
            name = field.split('`')[1]
            field_names.append(name)
    return field_names
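# Editor's illustration (hypothetical schema, not from the real dump): given
#   CREATE TABLE `bill` (`id` int(11), `content` varchar(255), PRIMARY KEY (`id`));
# extract_fields returns ['id', 'content'] -- the PRIMARY KEY entry is skipped
# because it does not start with a backquote.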
# Function to extract data from INSERT statements
def extract_data(insert_sql, field_names):
    # Find all INSERT ... VALUES groups
    print("Insert sql:", repr(insert_sql[:200]))
    inserts = re.findall(r'INSERT.*?INTO.*?VALUES\s*\((.*?)\);', insert_sql, re.DOTALL)
    print("Inserts found:", len(inserts))
    for i, ins in enumerate(inserts):
        print(f"Insert {i}:", repr(ins[:100]))
    data = []
    for insert in inserts:
        # Split values by comma, tracking single-quoted strings
        values = []
        current_value = ""
        in_string = False
        for char in insert:
            if char == "'" and not in_string:
                in_string = True
            elif char == "'" and in_string:
                in_string = False
            elif char == ',' and not in_string:
                values.append(current_value.strip())
                current_value = ""
                continue
            current_value += char
        if current_value.strip():
            values.append(current_value.strip())
        # Clean values (remove surrounding quotes)
        cleaned_values = []
        for val in values:
            val = val.strip()
            if val.startswith("'") and val.endswith("'"):
                val = val[1:-1]
            cleaned_values.append(val)
        # Get indices for selected fields
        selected_indices = [field_names.index(f) for f in selected_fields]
        # Create dict with mapped keys
        row = {}
        for f, i in zip(selected_fields, selected_indices):
            key = mapping.get(f, f)
            row[key] = cleaned_values[i]
        data.append(row)
    return data
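# Editor's note: the quote tracking above toggles on every single quote, so
# values containing escaped quotes (\' or '') would be split incorrectly;
# the script assumes the dump's string values contain no embedded quotes.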
# Main logic
if __name__ == "__main__":
    print("Calculating total lines...")
    with open('maya_business_bill.sql', 'r') as f:
        total_lines = sum(1 for _ in f)
    print(f"Total lines: {total_lines}")
    with open('maya_business_bill.sql', 'r') as f:
        create_table_sql = ''
        in_create = False
        field_names = []
        selected_indices = []
        with open('output.jsonl', 'w') as out_f:
            for line in tqdm(f, total=total_lines, desc="Processing SQL"):
                if line.strip().startswith('CREATE TABLE'):
                    in_create = True
                if in_create:
                    create_table_sql += line
                    if line.strip().endswith(';'):
                        in_create = False
                        field_names = extract_fields(create_table_sql)
                        print("Extracted fields:", field_names)
                        selected_indices = [field_names.index(f) for f in selected_fields]
                elif line.strip().startswith('INSERT INTO'):
                    # Process a single-line INSERT statement
                    match = re.search(r'INSERT.*?VALUES\s*\((.*?)\);', line.strip())
                    if match:
                        insert = match.group(1)
                        # Parse values (same quote-aware split as extract_data)
                        values = []
                        current_value = ""
                        in_string = False
                        for char in insert:
                            if char == "'" and not in_string:
                                in_string = True
                            elif char == "'" and in_string:
                                in_string = False
                            elif char == ',' and not in_string:
                                values.append(current_value.strip())
                                current_value = ""
                                continue
                            current_value += char
                        if current_value.strip():
                            values.append(current_value.strip())
                        # Clean values (remove surrounding quotes)
                        cleaned_values = []
                        for val in values:
                            val = val.strip()
                            if val.startswith("'") and val.endswith("'"):
                                val = val[1:-1]
                            cleaned_values.append(val)
                        # Create row with mapped keys; avoid reusing the
                        # name 'f', which holds the open file above
                        row = {}
                        for fld, idx in zip(selected_fields, selected_indices):
                            key = mapping.get(fld, fld)
                            row[key] = cleaned_values[idx]
                        out_f.write(json.dumps(row) + '\n')
    print("JSONL output saved to output.jsonl")
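A minimal end-to-end sketch (the schema below is an assumption for illustration, not taken from the real dump): for a dump line such as

    INSERT INTO `maya_business_bill` VALUES (1, 'Received money from JUAN');

on a table whose backquoted columns are `id` and `content`, the script writes one JSON object per INSERT to output.jsonl:

    {"content": "Received money from JUAN"}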


@@ -36,6 +36,19 @@ PLACEHOLDER_PATTERNS = {
    '<交易类型>': r'(.+?)',
}

def get_placeholder(action):
    """
    Return the placeholder that corresponds to a JSON message's action type.
    """
    if 'Received' in action:
        return '付款人号码'
    elif 'Sent' in action:
        return '收款人号码'
    elif 'Refunded' in action:
        return '付款人号码'
    else:
        return '付款人号码'  # default
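# Editor's illustration: get_placeholder('Sent money to') returns '收款人号码',
# while 'Received money from' and 'Refunded money via' both map to '付款人号码'.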

def normalize_text(text):
    # Pattern 8: receiving money from a bank (this rule must run first)
    text = re.sub(
@@ -153,6 +166,14 @@ def normalize_text(text):
        text
    )

    # New rule: handle all JSON-format messages uniformly
    # Matches action types: Received money from, Sent money to, Received settlement from, Reversed settlement from, etc.
    text = re.sub(
        r'\\\"(Received money from|Sent money to|Received settlement from|Reversed settlement from|Refunded money via|Sent money via)\\\",\\\"target\\\":\\\"(.+?)\\\"',
        lambda m: f'\\\"{m.group(1)}\\\",\\\"target\\\":\\\"<{get_placeholder(m.group(1))}>\\\"',
        text
    )
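    # Editor's illustration: inside an escaped-JSON message, the fragment
    #   \"Sent money to\",\"target\":\"0917 123 4567\"   (number is hypothetical)
    # is rewritten by the rule above to
    #   \"Sent money to\",\"target\":\"<收款人号码>\"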
    return text

def template_to_regex(template):
@@ -241,9 +262,10 @@ def run_dbscan_on_corpus(corpus, eps, min_samples, max_samples=10):
        return {processed_corpus[i]: [corpus[i]][:max_samples] for i in range(len(corpus))}

def extract_templates_iterative(input_file, output_file, rules, batch_size=1000, eps=0.4, min_samples=2, max_samples_per_template=0):
def extract_templates_iterative(input_files, output_file, rules, batch_size=1000, eps=0.4, min_samples=2, max_samples_per_template=0):
    """
    Extract templates with a hybrid mini-batch iterative strategy, collecting
    at most 10 raw samples per template.
    Supports multiple input files.
    """
    print("--- Starting iterative template extraction ---")
    final_templates = {}  # template -> list of original contents
@@ -251,10 +273,13 @@ def extract_templates_iterative(input_file, output_file, rules, batch_size=1000,
    batch_num = 1
    try:
        print(f"Step 1: processing '{input_file}' line by line and building the template library on the fly...")
        print(f"Step 1: processing input files {input_files} line by line and building the template library on the fly...")
        total_lines = 0
        for input_file in input_files:
            with open(input_file, 'r', encoding='utf-8') as f:
                total_lines = sum(1 for _ in f)
                total_lines += sum(1 for _ in f)
        for input_file in input_files:
            with open(input_file, 'r', encoding='utf-8') as f:
                for line in tqdm(f, total=total_lines, desc="Main process"):
                    try:
@@ -336,13 +361,14 @@ def extract_templates_iterative(input_file, output_file, rules, batch_size=1000,
        print(f"All templates successfully written to '{output_file}'")
    except FileNotFoundError:
        print(f"Error: input file '{input_file}' not found")
    except FileNotFoundError as e:
        print(f"Error: input file {e.filename} not found")
        return

def extract_values_with_templates(input_file, template_file, output_file):
def extract_values_with_templates(input_files, template_file, output_file):
    """
    Extract parameter values from the raw messages using the DBSCAN-generated templates.
    Supports multiple input files.
    """
    print("--- Starting parameter extraction with templates ---")
@@ -358,9 +384,12 @@ def extract_values_with_templates(input_file, template_file, output_file):
    # Extract values from the raw data
    extracted_values = []
    total_lines = 0
    for input_file in input_files:
        with open(input_file, 'r', encoding='utf-8') as f:
            total_lines = sum(1 for _ in f)
            total_lines += sum(1 for _ in f)
    for input_file in input_files:
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in tqdm(f, total=total_lines, desc="Extracting parameters"):
                try:
@@ -395,13 +424,13 @@ def extract_values_with_templates(input_file, template_file, output_file):

# --- Usage example ---
# Assumes the previous script has already been run, producing 'content_filtered.jsonl'
input_jsonl_file = 'content_filtered.jsonl'
input_jsonl_files = ['content_filtered.jsonl', 'output.jsonl']  # defaults; the list can be extended
output_template_file = 'templates_iterative.txt'
BATCH_PROCESSING_SIZE = 10000  # adjust to your memory and data volume

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Extract templates from GCash transaction data.')
    parser.add_argument('--input_file', type=str, default=input_jsonl_file, help='Input JSONL file path')
    parser.add_argument('--input_file', type=str, nargs='+', default=input_jsonl_files, help='Input JSONL file paths (multiple files supported)')
    parser.add_argument('--output_file', type=str, default=output_template_file, help='Output template file path')
    parser.add_argument('--batch_size', type=int, default=BATCH_PROCESSING_SIZE, help='Batch processing size (data volume)')
    parser.add_argument('--eps', type=float, default=0.4, help='DBSCAN eps parameter')
@@ -413,14 +442,14 @@ if __name__ == "__main__":
    if args.extract_values:
        # Run parameter extraction
        extract_values_with_templates(
            input_file=args.input_file,
            template_file=args.output_file,
            output_file='extracted_parameters.jsonl'
            input_files=args.input_file,
            template_file='templates_iterative.txt',
            output_file=args.output_file
        )
    else:
        # Run template extraction
        extract_templates_iterative(
            input_file=args.input_file,
            input_files=args.input_file,
            output_file=args.output_file,
            rules=PREDEFINED_RULES,
            batch_size=args.batch_size,
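
A possible invocation sketch (the script's file name is not shown in this view; extract_templates.py is assumed, as is an --extract_values flag defined in the unchanged part of the file):

    # Build templates from both inputs
    python extract_templates.py --input_file content_filtered.jsonl output.jsonl --output_file templates_iterative.txt

    # Then extract parameter values against the saved templates
    python extract_templates.py --extract_values --input_file content_filtered.jsonl output.jsonl --output_file extracted_parameters.jsonl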