update
This commit is contained in:
parent
acaf55c728
commit
254e7e79e9
36
sql_ml.py
36
sql_ml.py
@ -372,10 +372,18 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000
|
||||
final_templates[tpl].append(content)
|
||||
# 注册元信息(占位符与其组正则)
|
||||
if tpl not in templates_meta:
|
||||
# 尝试从 source_regex 中提取顶层捕获组模式并按占位符顺序配对
|
||||
try:
|
||||
group_patterns = extract_group_patterns_from_regex(ar['source_regex'])
|
||||
except Exception:
|
||||
group_patterns = []
|
||||
|
||||
phs = []
|
||||
for name in ar.get('placeholders', []):
|
||||
# 优先把 source_regex 作为权威,不在此处重复具体子模式(用 None 标记)
|
||||
phs.append({'name': name, 'pattern': None})
|
||||
for idx, name in enumerate(ar.get('placeholders', [])):
|
||||
# 如果成功提取到对应的捕获组模式,则使用之,否则回退到通配 '(.+?)'
|
||||
pat = group_patterns[idx] if idx < len(group_patterns) and group_patterns[idx] else '(.+?)'
|
||||
phs.append({'name': name, 'pattern': pat})
|
||||
|
||||
templates_meta[tpl] = {
|
||||
'placeholders': phs,
|
||||
'source_regex': ar['source_regex']
|
||||
@ -432,6 +440,8 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000
|
||||
print("\n--- 第 3 部分: 合并结果并保存 ---")
|
||||
print(f"总共找到 {len(final_templates)} 个唯一的模板。")
|
||||
|
||||
# 收集所有出现的占位符名称,写入单独文件以便查看
|
||||
all_placeholders = set()
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
for template, data_list in sorted(final_templates.items()):
|
||||
# 新格式:{"template":..., "regex":{name: pattern, ...}, "data":[...]}
|
||||
@ -462,9 +472,25 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000
|
||||
|
||||
json.dump(output_obj, f, ensure_ascii=False)
|
||||
f.write('\n')
|
||||
# 收集占位符(以 <name> 形式)
|
||||
for name in regex_map.keys():
|
||||
all_placeholders.add(f'<{name}>')
|
||||
# 如果模板字符串本身包含未出现在 regex_map 的占位符,也一并收集
|
||||
for name in re.findall(r'<([^>]+)>', template):
|
||||
all_placeholders.add(f'<{name}>')
|
||||
|
||||
print(f"所有模板已成功写入到 '{output_file}'。")
|
||||
|
||||
# 输出占位符清单到文件
|
||||
placeholders_file = 'placeholders_list.txt'
|
||||
try:
|
||||
with open(placeholders_file, 'w', encoding='utf-8') as pf:
|
||||
for ph in sorted(all_placeholders):
|
||||
pf.write(ph + '\n')
|
||||
print(f"所有占位符已写入到 '{placeholders_file}'(共 {len(all_placeholders)} 个)。")
|
||||
except Exception:
|
||||
print("警告:写入占位符清单失败。")
|
||||
|
||||
except FileNotFoundError as e:
|
||||
print(f"错误:找不到输入文件 {e.filename}。")
|
||||
return
|
||||
@ -614,7 +640,7 @@ def extract_values_with_templates(input_files, template_file, output_file, conte
|
||||
# --- 使用示例 ---
|
||||
# 假设您已经运行了上一个脚本,生成了 'content_filtered.jsonl'
|
||||
input_jsonl_files = ['content_filtered.jsonl', 'output.jsonl'] # 默认单个文件,可扩展为多个
|
||||
output_template_file = 'templates_iterative.txt'
|
||||
output_template_file = 'templates_iterative.jsonl'
|
||||
BATCH_PROCESSING_SIZE = 10000 # 可以根据你的内存和数据量调整
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -633,7 +659,7 @@ if __name__ == "__main__":
|
||||
# 执行参数提取
|
||||
extract_values_with_templates(
|
||||
input_files=args.input_file,
|
||||
template_file='templates_iterative.txt',
|
||||
template_file='templates_iterative.jsonl',
|
||||
output_file=args.output_file,
|
||||
content_key=args.content_key
|
||||
)
|
||||
|
Loading…
x
Reference in New Issue
Block a user