save

2025-09-09 01:32:20 +08:00 · 2025-09-09 01:32:20 +08:00 · 3d6e2d3ad7
commit 3d6e2d3ad7
parent 7778e2b431
2 changed files with 53 additions and 26 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,3 +4,4 @@
 *.txt
 *.csv
 *.log
+__pycache__
--- a/sql_ml.py
+++ b/sql_ml.py
@ -19,7 +19,7 @@ PREDEFINED_RULES = [
 def normalize_text(text):
    # 模式 8: 从银行收款 (这条规则必须先运行)
    text = re.sub(
-        r'Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$',
+        r'(?i)Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$',
        r'Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <网络或发票号>',
        text
    )
@ -27,7 +27,7 @@ def normalize_text(text):
  # 模式 13: 向未验证账户发送凭证
    # 结构: You have sent <货币> <金额> to an unverified account <手机号> on <日期> <时间> with MSG: <消息>. Your new balance is <货币> <金额>. Ref. No. <流水号>. Go to...
    text = re.sub(
-        r'^You have sent PHP [\d,]+\.\d{2} to an unverified account \d{11} on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\. Go to GCash Help Center to know how to secure your transactions\.$',
+        r'^You have sent PHP [\d,]+\.\d{2} to an unverified account [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\. Go to GCash Help Center to know how to secure your transactions\.$',
        r'You have sent PHP <金额> to an unverified account <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <流水号>. Go to GCash Help Center to know how to secure your transactions.',
        text
    )
@ -35,7 +35,7 @@ def normalize_text(text):
    # 模式 12: 详细发送凭证 (更新后可处理多段姓名)
    # 结构: You have sent <货币> <金额> to <收款人> <手机号> on <日期> <时间> with MSG: <消息>. Your new balance is <货币> <金额>. Ref. No. <流水号>.
    text = re.sub(
-        r'^You have sent PHP [\d,]+\.\d{2} to (?:[A-Z\*]+\s)+[A-Z\*]\.\s\d{11} on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$',
+        r'^You have sent PHP [\d,]+\.\d{2} to (?:[A-Z\*]+\s)+[A-Z\*]\.\s[\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$',
        r'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <流水号>.',
        text
    )
@ -56,7 +56,7 @@ def normalize_text(text):
    )
    # 模式 12: 详细发送凭证 (最终修正版，兼容所有已知姓名格式)
    text = re.sub(
-        r'^You have sent PHP [\d,]+\.\d{2} to .+? \d{11} on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$',
+        r'^You have sent PHP [\d,]+\.\d{2} to .+? [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$',
        r'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <参考号>.',
        text
    )   
@ -83,7 +83,7 @@ def normalize_text(text):
    # 新增规则：模式 7: 从一般来源收款 (这条规则紧随其后)
    # 它只会处理没有被上面那条规则匹配到的 "Received GCash from..."
    text = re.sub(
-        r'(?i)^Received GCash from .+$',
+        r'(?i)^Received GCash from [^<]+$',
        r'Received GCash from <付款人名称>',
        text
    )
@ -98,14 +98,14 @@ def normalize_text(text):
    # 模式 5 (来自用户最初的模板列表，这里将其具体化)
    # 结构: Payment to <商户名>
    text = re.sub(
-        r'^Payment to (.+)$',
+        r'^Payment to ([^,]+)$',
        r'Payment to <收款人名称>',
        text
    )

    text = re.sub(
        r'^(.+?) with (Ref\. no\.|Parent Ref\.No\.|Reference No\.) (.+)$',
-        r'<交易类型> with <参考号类型> <参考号>',
+        r'<交易类型> with Ref. no. <参考号>',
        text
    )

@ -114,7 +114,7 @@ def normalize_text(text):
         # 模式 8: 从银行收款
    # 结构: Received GCash from <机构名> with account ending in <尾号> via <网络> or with invno:<...>
    text = re.sub(
-        r'Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$',
+        r'(?i)Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$',
        r'Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <流水号>',
        text
    )
@ -135,8 +135,8 @@ def normalize_text(text):
    
    return  text

-def run_dbscan_on_corpus(corpus, eps, min_samples):
-    if not corpus: return set()
+def run_dbscan_on_corpus(corpus, eps, min_samples, max_samples=10):
+    if not corpus: return {}

    processed_corpus = [normalize_text(text) for text in corpus]
    
@ -147,14 +147,19 @@ def run_dbscan_on_corpus(corpus, eps, min_samples):
        db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine', n_jobs=-1).fit(X)
        labels = db.labels_
        
-        dbscan_templates = set()
+        dbscan_templates = {}
        unique_labels = set(labels)

        for label in unique_labels:
            class_member_indices = np.where(labels == label)[0]
            if label == -1: # 处理噪声点
                for idx in class_member_indices:
-                    dbscan_templates.add(processed_corpus[idx])
+                    original = corpus[idx]
+                    normalized = processed_corpus[idx]
+                    if normalized not in dbscan_templates:
+                        dbscan_templates[normalized] = []
+                    if len(dbscan_templates[normalized]) < max_samples:
+                        dbscan_templates[normalized].append(original)
                continue

            # 处理聚类
@ -163,21 +168,23 @@ def run_dbscan_on_corpus(corpus, eps, min_samples):
            similarities = cosine_similarity(cluster_vectors, centroid)
            most_representative_idx_in_cluster = np.argmax(similarities)
            original_corpus_idx = class_member_indices[most_representative_idx_in_cluster]
-            dbscan_templates.add(processed_corpus[original_corpus_idx])
+            most_representative_normalized = processed_corpus[original_corpus_idx]
+            cluster_originals = [corpus[idx] for idx in class_member_indices]
+            dbscan_templates[most_representative_normalized] = cluster_originals[:max_samples]
            
        return dbscan_templates
    except ValueError:
        # 如果批次中所有词都在停用词表中，TfidfVectorizer会报错
        print("警告: DBSCAN批次处理失败，可能因为内容过于单一或简短。将内容视为独立模板。")
-        return set(processed_corpus)
+        return {processed_corpus[i]: [corpus[i]][:max_samples] for i in range(len(corpus))}


-def extract_templates_iterative(input_file, output_file, rules, batch_size=1000, eps=0.4, min_samples=2):
+def extract_templates_iterative(input_file, output_file, rules, batch_size=1000, eps=0.4, min_samples=2, max_samples_per_template=0):
    """
-    使用小批量迭代的混合策略来提取模板。
+    使用小批量迭代的混合策略来提取模板，并为每个模板收集最多10条原始数据样本。
    """
    print("--- 开始迭代式模板提取 ---")
-    final_templates = set()
+    final_templates = {}  # template -> list of original contents
    unmatched_batch = []
    batch_num = 1

@ -196,13 +203,18 @@ def extract_templates_iterative(input_file, output_file, rules, batch_size=1000,
                    
                    # 1. 检查是否匹配已发现的任何模板
                    if normalized_content in final_templates:
+                        if len(final_templates[normalized_content]) < 10:
+                            final_templates[normalized_content].append(content)
                        continue
                    
                    # 2. 检查是否匹配预定义规则
                    matched_by_rule = False
                    for rule in rules:
                        if rule['pattern'].match(content):
-                            final_templates.add(normalized_content)
+                            if normalized_content not in final_templates:
+                                final_templates[normalized_content] = []
+                            if len(final_templates[normalized_content]) < 10:
+                                final_templates[normalized_content].append(content)
                            matched_by_rule = True
                            break
                    
@ -215,10 +227,15 @@ def extract_templates_iterative(input_file, output_file, rules, batch_size=1000,
                    # 4. 检查是否触发批处理
                    if len(unmatched_batch) >= batch_size:
                        print(f"\n--- 处理批次 #{batch_num} (大小: {len(unmatched_batch)}) ---")
-                        newly_found_templates = run_dbscan_on_corpus(unmatched_batch, eps, min_samples)
+                        newly_found_templates = run_dbscan_on_corpus(unmatched_batch, eps, min_samples, 10)
                        
                        print(f"批次 #{batch_num}: DBSCAN 发现了 {len(newly_found_templates)} 个潜在模板。")
-                        final_templates.update(newly_found_templates)
+                        for template, originals in newly_found_templates.items():
+                            if template in final_templates:
+                                remaining = 10 - len(final_templates[template])
+                                final_templates[template].extend(originals[:remaining])
+                            else:
+                                final_templates[template] = originals[:10]
                        print(f"当前总模板数: {len(final_templates)}")
                        
                        unmatched_batch.clear()
@ -231,10 +248,15 @@ def extract_templates_iterative(input_file, output_file, rules, batch_size=1000,
        print("\n--- 文件处理完毕，处理最后一批剩余内容 ---")
        if unmatched_batch:
            print(f"处理最后一个批次 (大小: {len(unmatched_batch)})")
-            newly_found_templates = run_dbscan_on_corpus(unmatched_batch, eps, min_samples)
+            newly_found_templates = run_dbscan_on_corpus(unmatched_batch, eps, min_samples, 10)
            
            print(f"最后一个批次: DBSCAN 发现了 {len(newly_found_templates)} 个潜在模板。")
-            final_templates.update(newly_found_templates)
+            for template, originals in newly_found_templates.items():
+                if template in final_templates:
+                    remaining = 10 - len(final_templates[template])
+                    final_templates[template].extend(originals[:remaining])
+                else:
+                    final_templates[template] = originals[:10]
        else:
            print("没有剩余内容需要处理。")
            
@ -243,8 +265,11 @@ def extract_templates_iterative(input_file, output_file, rules, batch_size=1000,
        print(f"总共找到 {len(final_templates)} 个唯一的模板。")
        
        with open(output_file, 'w', encoding='utf-8') as f:
-            for template in sorted(list(final_templates)):
-                json.dump({"content": template}, f, ensure_ascii=False)
+            for template, data_list in sorted(final_templates.items()):
+                if max_samples_per_template == 0:
+                    json.dump({"content": template}, f, ensure_ascii=False)
+                else:
+                    json.dump({"content": template, "data": data_list[:max_samples_per_template]}, f, ensure_ascii=False)
                f.write('\n')
                
        print(f"所有模板已成功写入到 '{output_file}'。")
@ -265,5 +290,6 @@ extract_templates_iterative(
    rules=PREDEFINED_RULES,
    batch_size=BATCH_PROCESSING_SIZE,
    eps=0.4,
-    min_samples=5 # 稍微提高min_samples可以得到更可靠的模板
+    min_samples=5, # 稍微提高min_samples可以得到更可靠的模板
+    max_samples_per_template=10  # 设置为正数以导出样本数据，0表示不导出
 )