diff --git a/.gitignore b/.gitignore index 893a9d7..81d0df6 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ *.jsonl *.txt *.csv -*.log \ No newline at end of file +*.log +__pycache__ \ No newline at end of file diff --git a/sql_ml.py b/sql_ml.py index b518c42..d0288bb 100644 --- a/sql_ml.py +++ b/sql_ml.py @@ -19,7 +19,7 @@ PREDEFINED_RULES = [ def normalize_text(text): # 模式 8: 从银行收款 (这条规则必须先运行) text = re.sub( - r'Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$', + r'(?i)Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$', r'Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <网络或发票号>', text ) @@ -27,7 +27,7 @@ def normalize_text(text): # 模式 13: 向未验证账户发送凭证 # 结构: You have sent <货币> <金额> to an unverified account <手机号> on <日期> <时间> with MSG: <消息>. Your new balance is <货币> <金额>. Ref. No. <流水号>. Go to... text = re.sub( - r'^You have sent PHP [\d,]+\.\d{2} to an unverified account \d{11} on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\. Go to GCash Help Center to know how to secure your transactions\.$', + r'^You have sent PHP [\d,]+\.\d{2} to an unverified account [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\. Go to GCash Help Center to know how to secure your transactions\.$', r'You have sent PHP <金额> to an unverified account <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <流水号>. Go to GCash Help Center to know how to secure your transactions.', text ) @@ -35,7 +35,7 @@ def normalize_text(text): # 模式 12: 详细发送凭证 (更新后可处理多段姓名) # 结构: You have sent <货币> <金额> to <收款人> <手机号> on <日期> <时间> with MSG: <消息>. Your new balance is <货币> <金额>. Ref. No. <流水号>. text = re.sub( - r'^You have sent PHP [\d,]+\.\d{2} to (?:[A-Z\*]+\s)+[A-Z\*]\.\s\d{11} on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$', + r'^You have sent PHP [\d,]+\.\d{2} to (?:[A-Z\*]+\s)+[A-Z\*]\.\s[\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$', r'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <流水号>.', text ) @@ -56,7 +56,7 @@ def normalize_text(text): ) # 模式 12: 详细发送凭证 (最终修正版,兼容所有已知姓名格式) text = re.sub( - r'^You have sent PHP [\d,]+\.\d{2} to .+? \d{11} on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$', + r'^You have sent PHP [\d,]+\.\d{2} to .+? [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$', r'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <参考号>.', text ) @@ -83,7 +83,7 @@ def normalize_text(text): # 新增规则:模式 7: 从一般来源收款 (这条规则紧随其后) # 它只会处理没有被上面那条规则匹配到的 "Received GCash from..." text = re.sub( - r'(?i)^Received GCash from .+$', + r'(?i)^Received GCash from [^<]+$', r'Received GCash from <付款人名称>', text ) @@ -98,14 +98,14 @@ def normalize_text(text): # 模式 5 (来自用户最初的模板列表,这里将其具体化) # 结构: Payment to <商户名> text = re.sub( - r'^Payment to (.+)$', + r'^Payment to ([^,]+)$', r'Payment to <收款人名称>', text ) text = re.sub( r'^(.+?) with (Ref\. no\.|Parent Ref\.No\.|Reference No\.) (.+)$', - r'<交易类型> with <参考号类型> <参考号>', + r'<交易类型> with Ref. no. <参考号>', text ) @@ -114,7 +114,7 @@ def normalize_text(text): # 模式 8: 从银行收款 # 结构: Received GCash from <机构名> with account ending in <尾号> via <网络> or with invno:<...> text = re.sub( - r'Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$', + r'(?i)Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$', r'Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <流水号>', text ) @@ -135,8 +135,8 @@ def normalize_text(text): return text -def run_dbscan_on_corpus(corpus, eps, min_samples): - if not corpus: return set() +def run_dbscan_on_corpus(corpus, eps, min_samples, max_samples=10): + if not corpus: return {} processed_corpus = [normalize_text(text) for text in corpus] @@ -147,14 +147,19 @@ def run_dbscan_on_corpus(corpus, eps, min_samples): db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine', n_jobs=-1).fit(X) labels = db.labels_ - dbscan_templates = set() + dbscan_templates = {} unique_labels = set(labels) for label in unique_labels: class_member_indices = np.where(labels == label)[0] if label == -1: # 处理噪声点 for idx in class_member_indices: - dbscan_templates.add(processed_corpus[idx]) + original = corpus[idx] + normalized = processed_corpus[idx] + if normalized not in dbscan_templates: + dbscan_templates[normalized] = [] + if len(dbscan_templates[normalized]) < max_samples: + dbscan_templates[normalized].append(original) continue # 处理聚类 @@ -163,21 +168,23 @@ def run_dbscan_on_corpus(corpus, eps, min_samples): similarities = cosine_similarity(cluster_vectors, centroid) most_representative_idx_in_cluster = np.argmax(similarities) original_corpus_idx = class_member_indices[most_representative_idx_in_cluster] - dbscan_templates.add(processed_corpus[original_corpus_idx]) + most_representative_normalized = processed_corpus[original_corpus_idx] + cluster_originals = [corpus[idx] for idx in class_member_indices] + dbscan_templates[most_representative_normalized] = cluster_originals[:max_samples] return dbscan_templates except ValueError: # 如果批次中所有词都在停用词表中,TfidfVectorizer会报错 print("警告: DBSCAN批次处理失败,可能因为内容过于单一或简短。将内容视为独立模板。") - return set(processed_corpus) + return {processed_corpus[i]: [corpus[i]][:max_samples] for i in range(len(corpus))} -def extract_templates_iterative(input_file, output_file, rules, batch_size=1000, eps=0.4, min_samples=2): +def extract_templates_iterative(input_file, output_file, rules, batch_size=1000, eps=0.4, min_samples=2, max_samples_per_template=0): """ - 使用小批量迭代的混合策略来提取模板。 + 使用小批量迭代的混合策略来提取模板,并为每个模板收集最多10个原始数据集。 """ print("--- 开始迭代式模板提取 ---") - final_templates = set() + final_templates = {} # template -> list of original contents unmatched_batch = [] batch_num = 1 @@ -196,13 +203,18 @@ def extract_templates_iterative(input_file, output_file, rules, batch_size=1000, # 1. 检查是否匹配已发现的任何模板 if normalized_content in final_templates: + if len(final_templates[normalized_content]) < 10: + final_templates[normalized_content].append(content) continue # 2. 检查是否匹配预定义规则 matched_by_rule = False for rule in rules: if rule['pattern'].match(content): - final_templates.add(normalized_content) + if normalized_content not in final_templates: + final_templates[normalized_content] = [] + if len(final_templates[normalized_content]) < 10: + final_templates[normalized_content].append(content) matched_by_rule = True break @@ -215,10 +227,15 @@ def extract_templates_iterative(input_file, output_file, rules, batch_size=1000, # 4. 检查是否触发批处理 if len(unmatched_batch) >= batch_size: print(f"\n--- 处理批次 #{batch_num} (大小: {len(unmatched_batch)}) ---") - newly_found_templates = run_dbscan_on_corpus(unmatched_batch, eps, min_samples) + newly_found_templates = run_dbscan_on_corpus(unmatched_batch, eps, min_samples, 10) print(f"批次 #{batch_num}: DBSCAN 发现了 {len(newly_found_templates)} 个潜在模板。") - final_templates.update(newly_found_templates) + for template, originals in newly_found_templates.items(): + if template in final_templates: + remaining = 10 - len(final_templates[template]) + final_templates[template].extend(originals[:remaining]) + else: + final_templates[template] = originals[:10] print(f"当前总模板数: {len(final_templates)}") unmatched_batch.clear() @@ -231,10 +248,15 @@ def extract_templates_iterative(input_file, output_file, rules, batch_size=1000, print("\n--- 文件处理完毕,处理最后一批剩余内容 ---") if unmatched_batch: print(f"处理最后一个批次 (大小: {len(unmatched_batch)})") - newly_found_templates = run_dbscan_on_corpus(unmatched_batch, eps, min_samples) + newly_found_templates = run_dbscan_on_corpus(unmatched_batch, eps, min_samples, 10) print(f"最后一个批次: DBSCAN 发现了 {len(newly_found_templates)} 个潜在模板。") - final_templates.update(newly_found_templates) + for template, originals in newly_found_templates.items(): + if template in final_templates: + remaining = 10 - len(final_templates[template]) + final_templates[template].extend(originals[:remaining]) + else: + final_templates[template] = originals[:10] else: print("没有剩余内容需要处理。") @@ -243,8 +265,11 @@ def extract_templates_iterative(input_file, output_file, rules, batch_size=1000, print(f"总共找到 {len(final_templates)} 个唯一的模板。") with open(output_file, 'w', encoding='utf-8') as f: - for template in sorted(list(final_templates)): - json.dump({"content": template}, f, ensure_ascii=False) + for template, data_list in sorted(final_templates.items()): + if max_samples_per_template == 0: + json.dump({"content": template}, f, ensure_ascii=False) + else: + json.dump({"content": template, "data": data_list[:max_samples_per_template]}, f, ensure_ascii=False) f.write('\n') print(f"所有模板已成功写入到 '{output_file}'。") @@ -265,5 +290,6 @@ extract_templates_iterative( rules=PREDEFINED_RULES, batch_size=BATCH_PROCESSING_SIZE, eps=0.4, - min_samples=5 # 稍微提高min_samples可以得到更可靠的模板 + min_samples=5, # 稍微提高min_samples可以得到更可靠的模板 + max_samples_per_template=10 # 设置为正数以导出样本数据,0表示不导出 ) \ No newline at end of file