diff --git a/sql_ml.py b/sql_ml.py index b37ce18..067833e 100644 --- a/sql_ml.py +++ b/sql_ml.py @@ -372,14 +372,22 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000 final_templates[tpl].append(content) # 注册元信息(占位符与其组正则) if tpl not in templates_meta: - phs = [] - for name in ar.get('placeholders', []): - # 优先把 source_regex 作为权威,不在此处重复具体子模式(用 None 标记) - phs.append({'name': name, 'pattern': None}) - templates_meta[tpl] = { - 'placeholders': phs, - 'source_regex': ar['source_regex'] - } + # 尝试从 source_regex 中提取顶层捕获组模式并按占位符顺序配对 + try: + group_patterns = extract_group_patterns_from_regex(ar['source_regex']) + except Exception: + group_patterns = [] + + phs = [] + for idx, name in enumerate(ar.get('placeholders', [])): + # 如果成功提取到对应的捕获组模式,则使用之,否则回退到通配 '(.+?)' + pat = group_patterns[idx] if idx < len(group_patterns) and group_patterns[idx] else '(.+?)' + phs.append({'name': name, 'pattern': pat}) + + templates_meta[tpl] = { + 'placeholders': phs, + 'source_regex': ar['source_regex'] + } matched_by_rule = True break except re.error: @@ -432,6 +440,8 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000 print("\n--- 第 3 部分: 合并结果并保存 ---") print(f"总共找到 {len(final_templates)} 个唯一的模板。") + # 收集所有出现的占位符名称,写入单独文件以便查看 + all_placeholders = set() with open(output_file, 'w', encoding='utf-8') as f: for template, data_list in sorted(final_templates.items()): # 新格式:{"template":..., "regex":{name: pattern, ...}, "data":[...]} @@ -462,9 +472,25 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000 json.dump(output_obj, f, ensure_ascii=False) f.write('\n') + # 收集占位符(以 形式) + for name in regex_map.keys(): + all_placeholders.add(f'<{name}>') + # 如果模板字符串本身包含未出现在 regex_map 的占位符,也一并收集 + for name in re.findall(r'<([^>]+)>', template): + all_placeholders.add(f'<{name}>') print(f"所有模板已成功写入到 '{output_file}'。") + # 输出占位符清单到文件 + placeholders_file = 'placeholders_list.txt' + try: + with open(placeholders_file, 'w', encoding='utf-8') as pf: + for ph in sorted(all_placeholders): + pf.write(ph + '\n') + print(f"所有占位符已写入到 '{placeholders_file}'(共 {len(all_placeholders)} 个)。") + except Exception: + print("警告:写入占位符清单失败。") + except FileNotFoundError as e: print(f"错误:找不到输入文件 {e.filename}。") return @@ -614,7 +640,7 @@ def extract_values_with_templates(input_files, template_file, output_file, conte # --- 使用示例 --- # 假设您已经运行了上一个脚本,生成了 'content_filtered.jsonl' input_jsonl_files = ['content_filtered.jsonl', 'output.jsonl'] # 默认单个文件,可扩展为多个 -output_template_file = 'templates_iterative.txt' +output_template_file = 'templates_iterative.jsonl' BATCH_PROCESSING_SIZE = 10000 # 可以根据你的内存和数据量调整 if __name__ == "__main__": @@ -633,7 +659,7 @@ if __name__ == "__main__": # 执行参数提取 extract_values_with_templates( input_files=args.input_file, - template_file='templates_iterative.txt', + template_file='templates_iterative.jsonl', output_file=args.output_file, content_key=args.content_key )