update

2025-09-09 15:56:47 +08:00 · 2025-09-09 15:56:47 +08:00 · be83924799
commit be83924799
parent 6b5b05de17
1 changed file with 196 additions and 13 deletions
--- a/sql_ml.py
+++ b/sql_ml.py
@ -17,6 +17,18 @@ PREDEFINED_RULES = [
    # {'name': 'Sent GCash to Account', 'pattern': re.compile(r'^Sent GCash to .+? with account ending in \d+$')}
 ]

+# Abstract rules: each entry declares, side by side, the source regex, the
+# resulting template text, the placeholder names, and the regex of each group.
+# During template extraction a message is tested with re.match against
+# 'source_regex'; on a hit the message is filed under 'template', and
+# 'placeholders' / 'group_patterns' are paired positionally (zipped) to record
+# which pattern belongs to which placeholder.
+ABSTRACT_RULES = [
+    {
+        # Example: spell out the original regex together with its placeholders
+        # and the regex used by each capture group.
+        'source_regex': r'^Sent GCash to (.+?) with account ending in (\d+)$',
+        'template': 'Sent GCash to <收款人名称> with account ending in <银行4位数尾号>',
+        'placeholders': ['收款人名称', '银行4位数尾号'],
+        'group_patterns': ['(.+?)', '(\\d+)']
+    },
+    # Add further abstract rules here.
+]
+
 # 占位符到正则表达式的映射
 PLACEHOLDER_PATTERNS = {
    '<金额>': r'([\d,.]+)',
@ -191,6 +203,58 @@ def template_to_regex(template):
    
    return escaped_template

+
+def build_regex_from_template_data(template_data):
+    """
+    Build the (uncompiled) regex string used to match a message against a template.
+
+    Resolution order:
+      1. A non-empty ``template_data['source_regex']`` is returned verbatim.
+      2. Without placeholder metadata, fall back to
+         ``template_to_regex(template_data['content'])``.
+      3. Otherwise ``content`` is escaped and every ``<name>`` placeholder is
+         replaced by a named group ``(?P<safe_name>pattern)``.
+
+    Args:
+        template_data: dict with the optional keys ``content`` (template text),
+            ``placeholders`` (a list of ``{'name': ..., 'pattern': ...}`` dicts
+            or of bare placeholder-name strings) and ``source_regex``.
+
+    Returns:
+        The regex as a string; compiling it is left to the caller. The result
+        can still be rejected by ``re`` (``re.error``), e.g. when two different
+        placeholder names collapse to the same sanitized group name.
+    """
+    # An explicit source regex always wins and is returned untouched.
+    source_regex = template_data.get('source_regex')
+    if source_regex:
+        return source_regex
+
+    # ``or ''`` also covers an explicit None (e.g. JSON null) for the content.
+    template = template_data.get('content') or ''
+    placeholders_meta = template_data.get('placeholders')
+
+    # No placeholder metadata: use the legacy template -> regex conversion.
+    if not placeholders_meta:
+        return template_to_regex(template)
+
+    # Escape the literal text; placeholders are swapped in afterwards.
+    escaped_template = re.escape(template)
+
+    def strip_outer_parens(pat: str) -> str:
+        """Drop one enclosing plain capturing group, if the whole pattern is one.
+
+        The pattern is only unwrapped when its first '(' is closed by its last
+        ')'. Patterns such as '(a)|(b)' or '(?:x)' are returned unchanged,
+        because cutting their first and last character would change their
+        meaning or make them invalid.
+        """
+        pat = pat.strip()
+        if len(pat) < 2 or not (pat.startswith('(') and pat.endswith(')')):
+            return pat
+        if pat.startswith('(?'):
+            # Extension syntax ((?:...), (?P<..>...), lookarounds): keep as is.
+            return pat
+
+        last = len(pat) - 1
+        depth = 0
+        in_class = False
+        i = 0
+        while i <= last:
+            ch = pat[i]
+            if ch == '\\':
+                # Skip the escaped character; it can never open or close a group.
+                i += 2
+                continue
+            if in_class:
+                if ch == ']':
+                    in_class = False
+            elif ch == '[':
+                in_class = True
+                # A ']' right after '[' or '[^' is a literal member of the class.
+                j = i + 1
+                if j <= last and pat[j] == '^':
+                    j += 1
+                if j <= last and pat[j] == ']':
+                    i = j
+            elif ch == '(':
+                depth += 1
+            elif ch == ')':
+                depth -= 1
+                if depth == 0 and i != last:
+                    # The opening paren closes early: not one enclosing group.
+                    return pat
+            i += 1
+        return pat[1:-1] if depth == 0 else pat
+
+    # placeholders_meta is either [{'name': ..., 'pattern': ...}, ...] or ['name1', 'name2'].
+    for ph in placeholders_meta:
+        if isinstance(ph, dict):
+            name = ph.get('name')
+            raw_pattern = ph.get('pattern')
+        else:
+            # Legacy format: only the placeholder name is given.
+            name = ph
+            raw_pattern = None
+
+        # An entry without a usable name cannot be located in the template.
+        if not isinstance(name, str) or not name:
+            continue
+
+        if isinstance(raw_pattern, str) and raw_pattern:
+            pattern = raw_pattern
+        elif isinstance(ph, dict):
+            # Missing, empty or null pattern: match anything, lazily.
+            pattern = '.+?'
+        else:
+            # Look up the default pattern registered for this placeholder.
+            pattern = PLACEHOLDER_PATTERNS.get(f'<{name}>', '(.+?)')
+
+        # Remove a redundant enclosing group so the value is captured only once.
+        inner = strip_outer_parens(pattern)
+
+        # Every character outside [0-9A-Za-z_] becomes '_' to form the group name.
+        # The value-extraction code maps group names back to placeholder names
+        # with this same substitution, so the scheme must stay in sync with it.
+        # NOTE: distinct names can collapse to the same group name (e.g. two
+        # non-ASCII names of equal length), which makes the regex fail to compile.
+        safe_name = re.sub(r"[^0-9A-Za-z_]", "_", name)
+
+        named_group = f"(?P<{safe_name}>{inner})"
+
+        # The template was escaped above, so search for the escaped placeholder.
+        escaped_placeholder = re.escape(f'<{name}>')
+        escaped_template = escaped_template.replace(escaped_placeholder, named_group)
+
+    return escaped_template
+
 def extract_parameters(template, message):
    """
    从消息中提取参数值
@ -269,6 +333,8 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000
    """
    print("--- 开始迭代式模板提取 ---")
    final_templates = {}  # template -> list of original contents
+    # 存储模板的元信息（占位符与其正则、原始正则等）
+    templates_meta = {}  # template -> { 'placeholders': [ {name, pattern}, ... ], 'source_regex': str }
    unmatched_batch = []
    batch_num = 1

@ -310,6 +376,30 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000
                                    final_templates[normalized_content].append(content)
                                matched_by_rule = True
                                break
+                        # 额外：检查抽象化规则（source_regex + template + placeholders）
+                        if not matched_by_rule:
+                            for ar in ABSTRACT_RULES:
+                                try:
+                                    if re.match(ar['source_regex'], content):
+                                        tpl = ar['template']
+                                        if tpl not in final_templates:
+                                            final_templates[tpl] = []
+                                        if len(final_templates[tpl]) < 10:
+                                            final_templates[tpl].append(content)
+                                        # 注册元信息（占位符与其组正则）
+                                        if tpl not in templates_meta:
+                                            phs = []
+                                            for name, gp in zip(ar.get('placeholders', []), ar.get('group_patterns', [])):
+                                                phs.append({'name': name, 'pattern': gp})
+                                            templates_meta[tpl] = {
+                                                'placeholders': phs,
+                                                'source_regex': ar['source_regex']
+                                            }
+                                        matched_by_rule = True
+                                        break
+                                except re.error:
+                                    # 如果抽象规则本身有错误，跳过
+                                    continue

                        if matched_by_rule:
                            continue
@ -359,10 +449,33 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000
        
        with open(output_file, 'w', encoding='utf-8') as f:
            for template, data_list in sorted(final_templates.items()):
-                if max_samples_per_template == 0:
-                    json.dump({"content": template}, f, ensure_ascii=False)
+                # 新格式：{"template":..., "regex":{name: pattern, ...}, "data":[...]}
+                output_obj = {'template': template}
+
+                # 构建 regex 映射：placeholder name -> pattern
+                regex_map = {}
+                if template in templates_meta:
+                    for ph in templates_meta[template].get('placeholders', []):
+                        if isinstance(ph, dict):
+                            regex_map[ph.get('name')] = ph.get('pattern')
+                        else:
+                            # ph 可能是名字字符串，尝试从全局映射取 pattern
+                            regex_map[ph] = PLACEHOLDER_PATTERNS.get(f'<{ph}>', '(.+?)')
+                    # 优先保留 source_regex as special key if present
+                    if templates_meta[template].get('source_regex'):
+                        output_obj['source_regex'] = templates_meta[template]['source_regex']
                else:
-                    json.dump({"content": template, "data": data_list[:max_samples_per_template]}, f, ensure_ascii=False)
+                    # 没有元信息时，尝试根据模板内占位符名用默认映射构建
+                    placeholders = re.findall(r'<([^>]+)>', template)
+                    for name in placeholders:
+                        regex_map[name] = PLACEHOLDER_PATTERNS.get(f'<{name}>', '(.+?)')
+
+                output_obj['regex'] = regex_map
+
+                # 示例数据
+                output_obj['data'] = data_list[:max_samples_per_template] if max_samples_per_template != 0 else []
+
+                json.dump(output_obj, f, ensure_ascii=False)
                f.write('\n')
                
        print(f"所有模板已成功写入到 '{output_file}'。")
@ -378,14 +491,43 @@ def extract_values_with_templates(input_files, template_file, output_file, conte
    """
    print("--- 开始使用模板提取参数值 ---")
    
-    # 读取模板
-    templates = []
+    # 读取模板及其元信息。支持两种格式：旧的 {content, placeholders, source_regex} 或 新的 {template, regex, source_regex, data}
+    templates_meta = []
    with open(template_file, 'r', encoding='utf-8') as f:
        for line in f:
-            template_data = json.loads(line)
-            templates.append(template_data['content'])
+            try:
+                raw = json.loads(line)
+            except json.JSONDecodeError:
+                continue

-    print(f"已加载 {len(templates)} 个模板")
+            # 规范化到内部使用的 tmeta 格式：{ 'content': ..., 'placeholders': [ {name, pattern}, ... ], 'source_regex': ... }
+            tmeta = {}
+            if 'template' in raw:
+                tmeta['content'] = raw.get('template')
+                # raw may have 'regex' mapping name->pattern
+                regex_map = raw.get('regex', {})
+                phs = []
+                for name, pat in regex_map.items():
+                    phs.append({'name': name, 'pattern': pat})
+                # also consider raw['source_regex'] and raw['data'] if present
+                if 'source_regex' in raw:
+                    tmeta['source_regex'] = raw.get('source_regex')
+                if phs:
+                    tmeta['placeholders'] = phs
+            elif 'content' in raw:
+                # backward-compatible
+                tmeta['content'] = raw.get('content')
+                if 'placeholders' in raw:
+                    tmeta['placeholders'] = raw.get('placeholders')
+                if 'source_regex' in raw:
+                    tmeta['source_regex'] = raw.get('source_regex')
+            else:
+                # skip unknown format
+                continue
+
+            templates_meta.append(tmeta)
+
+    print(f"已加载 {len(templates_meta)} 个模板（含元信息/已规范化）")
    
    # 从原始数据中提取值
    extracted_values = []
@ -411,11 +553,52 @@ def extract_values_with_templates(input_files, template_file, output_file, conte
                        continue
                    
                    # 尝试匹配每个模板
-                    for template in templates:
-                        parameters = extract_parameters(template, content)
+                    for tmeta in templates_meta:
+                        # 构建用于匹配的正则
+                        try:
+                            regex_str = build_regex_from_template_data(tmeta)
+                            match = re.search(regex_str, content)
+                        except re.error:
+                            # 如果生成的正则有问题，退回到老方法
+                            match = None
+
+                        parameters = {}
+                        if match:
+                            # 如果有命名组，优先使用命名组
+                            if match.groupdict():
+                                # 将安全组名还原为原始占位符名（如果有元信息）
+                                gd = match.groupdict()
+                                # 如果模板元信息里有占位符名映射，则把原名映射回来
+                                ph_map = {}
+                                if 'placeholders' in tmeta and isinstance(tmeta['placeholders'], list):
+                                    for ph in tmeta['placeholders']:
+                                        if isinstance(ph, dict):
+                                            orig = ph.get('name')
+                                        else:
+                                            orig = ph
+                                        safe = re.sub(r"[^0-9A-Za-z_]", "_", orig)
+                                        ph_map[safe] = orig
+
+                                for safe_name, val in gd.items():
+                                    orig_name = ph_map.get(safe_name, safe_name)
+                                    parameters[f'<{orig_name}>'] = val
+                            else:
+                                # 否则使用位置组，借助模板中的占位符顺序
+                                values = match.groups()
+                                placeholders = re.findall(r'<[^>]+>', tmeta.get('content', ''))
+                                for i, placeholder in enumerate(placeholders):
+                                    if i < len(values):
+                                        parameters[placeholder] = values[i]
+
+                        else:
+                            # 退回老方法：通过模板字符串替换占位符生成正则
+                            tpl = tmeta.get('content')
+                            if tpl:
+                                parameters = extract_parameters(tpl, content)
+
                        if parameters:
                            extracted_values.append({
-                                'template': template,
+                                'template': tmeta.get('content'),
                                'message': content,
                                'parameters': parameters
                            })