"""Iteratively extract message templates from a JSONL file of GCash notifications.

Each input line is a JSON object with a ``content`` string.  Messages are
first normalized by regex rules that replace variable parts (amounts, names,
account numbers, reference numbers, ...) with placeholders.  Whatever is not
covered by an already-known template or a predefined rule is collected into
batches, clustered with TF-IDF + DBSCAN, and one representative per cluster
is kept as a template.
"""
# Reconstructed from the ``sql_ml.py`` hunk of commit 7778e2b ("save").
# The same commit adds a .gitignore ignoring *.7z, *.json, *.jsonl, *.txt,
# *.csv and *.log.

import json
import re

import numpy as np

# scikit-learn and tqdm are imported inside the functions that need them so
# that ``normalize_text`` stays importable without the clustering stack.

# --- Rules and helper functions ---

# Predefined rules checked against the RAW message with ``pattern.match``.
# Each entry is a dict with a 'name' and a compiled 'pattern'.  All examples
# are currently disabled, so the list is empty.
PREDEFINED_RULES = [
    # {'name': 'Transfer', 'pattern': re.compile(r'^Transfer from \d+ to \d+$')},
    # {'name': 'International Remittance', 'pattern': re.compile(r'^International Remittance$')},
    # {'name': 'Bill Payment', 'pattern': re.compile(r'^Bill payment successful for amount \d+$')},
    # {'name': 'New Message', 'pattern': re.compile(r'^You have a new message from \d+\.$')},
    # # Matches e.g. "Sent GCash to GoTyme Bank with account ending in 6784"
    # {'name': 'Sent GCash to Account', 'pattern': re.compile(r'^Sent GCash to .+? with account ending in \d+$')}
]

# Outputs of the two "specific" rules that a later, more generic rule would
# otherwise match again and overwrite.
_BANK_RECEIPT_TEMPLATE = 'Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <网络或发票号>'
_MERCHANT_PAYMENT_TEMPLATE = 'Payment to <收款人名称>, Merchant Transaction Number: <交易单号>'

# Ordered (compiled pattern, replacement) pairs.  Every rule is applied in
# turn to the result of the previous one, so ORDER MATTERS: specific rules
# come before the generic rule for the same message family.
_NORMALIZATION_RULES = (
    # Pattern 8: money received from a bank account (must run before pattern 7).
    (
        re.compile(r'Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$'),
        _BANK_RECEIPT_TEMPLATE,
    ),
    # Pattern 13: send receipt to an unverified account.
    # You have sent <cur> <amount> to an unverified account <phone> on <date> <time>
    # with MSG: <msg>. Your new balance is <cur> <amount>. Ref. No. <ref>. Go to...
    (
        re.compile(
            r'^You have sent PHP [\d,]+\.\d{2} to an unverified account \d{11} on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\. Go to GCash Help Center to know how to secure your transactions\.$'
        ),
        'You have sent PHP <金额> to an unverified account <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <流水号>. Go to GCash Help Center to know how to secure your transactions.',
    ),
    # Pattern 12: detailed send receipt, masked multi-part recipient name.
    (
        re.compile(
            r'^You have sent PHP [\d,]+\.\d{2} to (?:[A-Z\*]+\s)+[A-Z\*]\.\s\d{11} on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$'
        ),
        'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <流水号>.',
    ),
    # Pattern 11 (institution): detailed receive receipt ending in "Use now to buy load...".
    (
        re.compile(
            r'^You have received\s+(?:PHP\s+)?[\d,.]+\s+of GCash from\s+.+?\. Your new balance is\s+(?:PHP\s+)?[\d,.]+\s+\d{1,2}-\d{1,2}-\d{2,4}\s+\d{1,2}:\d{1,2}(?::\d{1,2})?\s+[AP]M\. Ref\. No\.\s+.+?\. Use now to buy load, purchase items, send money, pay bills, and a lot more!$'
        ),
        'You have received <金额> of GCash from <付款人名称>. Your new balance is <金额>. <日期时间>. Ref. No. <流水号>. Use now to buy load, purchase items, send money, pay bills, and a lot more!',
    ),
    # Pattern 11 (person): detailed receive receipt with "w/ MSG:"; the balance
    # sentence and the "To access your funds," tail are optional.
    (
        re.compile(
            r'^You have received PHP [\d,.]+\s+of GCash from .+? w/ MSG: .*\. (?:Your new balance is PHP [\d,.]*\.\s)?Ref\. No\. \d+\.(?: To access your funds,.*)?$'
        ),
        'You have received PHP <金额> of GCash from <付款人名称> w/ MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <参考号>.',
    ),
    # Pattern 12 (catch-all): detailed send receipt with any recipient name format.
    (
        re.compile(
            r'^You have sent PHP [\d,]+\.\d{2} to .+? \d{11} on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$'
        ),
        'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <参考号>.',
    ),
    # Pattern 10: QR payment.
    # You have paid <amount> via GCash to <payee> on <datetime>. Ref. No. <ref>. QRPH Invoice No. <ref>.
    (
        re.compile(
            r'^You have paid P[\d,.]+\s+via GCash to .+? on \d{1,2}-\d{1,2}-\d{2,4}\s\d{1,2}:\d{1,2}:\d{1,2}\s+[AP]M\. Ref\. No\.\s+\d+\. QRPH Invoice No\.\s+\d+\.$'
        ),
        'You have paid P<金额> via GCash to <收款人名称> on <日期时间>. Ref. No. <参考号>. QRPH Invoice No. <参考号>.',
    ),
    # Pattern 9: Sent GCash to <institution> with account ending in <digits>.
    (
        re.compile(r'Sent GCash to (.+?) with account ending in (\d+)$'),
        'Sent GCash to <收款人名称> with account ending in <银行4位数尾号>',
    ),
    # Pattern 7: money received from any other source (case-insensitive).
    # The negative lookahead skips text that pattern 8 already turned into the
    # bank-receipt template; without it this rule would overwrite that result.
    (
        re.compile(
            r'^(?!' + re.escape(_BANK_RECEIPT_TEMPLATE) + r'$)Received GCash from .+$',
            re.IGNORECASE,
        ),
        'Received GCash from <付款人名称>',
    ),
    # Pattern 6: Payment to <merchant>, Merchant Transaction Number: <number>.
    (
        re.compile(r'Payment to (.+?), Merchant Transaction Number: (.+)$'),
        _MERCHANT_PAYMENT_TEMPLATE,
    ),
    # Pattern 5: Payment to <merchant>.  The lookahead keeps pattern 6's
    # output from being collapsed into this more generic template.
    (
        re.compile(r'^(?!' + re.escape(_MERCHANT_PAYMENT_TEMPLATE) + r'$)Payment to (.+)$'),
        'Payment to <收款人名称>',
    ),
    # "<transaction type> with <reference label> <reference number>".
    (
        re.compile(r'^(.+?) with (Ref\. no\.|Parent Ref\.No\.|Reference No\.) (.+)$'),
        '<交易类型> with <参考号类型> <参考号>',
    ),
    (
        re.compile(r'^Transfer from \S+ to \S+$'),
        'Transfer from <付款人号码> to <收款人号码>',
    ),
    # Buy Load Transaction.
    (
        re.compile(r'^Buy Load Transaction for .+$'),
        'Buy Load Transaction for <付款人号码>',
    ),
    # Refund.
    (
        re.compile(r'^Refund from .+$'),
        'Refund from <收款人名称>',
    ),
)


def normalize_text(text):
    """Replace the variable parts of a notification message with placeholders.

    Every rule in ``_NORMALIZATION_RULES`` is applied in order to the result
    of the previous rule.  Text that matches no rule is returned unchanged.

    Args:
        text: The raw message string.

    Returns:
        The message with amounts, names, numbers, etc. replaced by
        ``<...>`` placeholders.
    """
    for pattern, replacement in _NORMALIZATION_RULES:
        text = pattern.sub(replacement, text)
    return text


def run_dbscan_on_corpus(corpus, eps, min_samples):
    """Cluster a batch of messages and return one template per cluster.

    Messages are normalized, vectorized with TF-IDF and clustered with DBSCAN
    using cosine distance.  For each cluster the member most similar to the
    cluster centroid is kept; every noise point (label ``-1``) is kept as its
    own template.

    Args:
        corpus: List of raw message strings.
        eps: DBSCAN ``eps`` parameter.
        min_samples: DBSCAN ``min_samples`` parameter.

    Returns:
        A set of normalized message strings.  Empty for an empty corpus.  If
        vectorizing or clustering raises ``ValueError``, every normalized
        message in the batch is returned.
    """
    if not corpus:
        return set()

    from sklearn.cluster import DBSCAN
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    processed_corpus = [normalize_text(text) for text in corpus]

    try:
        X = TfidfVectorizer().fit_transform(processed_corpus)
        labels = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine', n_jobs=-1).fit(X).labels_

        dbscan_templates = set()
        for label in set(labels):
            member_indices = np.where(labels == label)[0]
            if label == -1:
                # Noise points belong to no cluster; keep each one as is.
                dbscan_templates.update(processed_corpus[idx] for idx in member_indices)
                continue

            # Pick the cluster member closest to the centroid as representative.
            cluster_vectors = X[member_indices]
            centroid = np.asarray(cluster_vectors.mean(axis=0))
            similarities = cosine_similarity(cluster_vectors, centroid)
            representative_idx = member_indices[np.argmax(similarities)]
            dbscan_templates.add(processed_corpus[representative_idx])

        return dbscan_templates
    except ValueError:
        # TfidfVectorizer raises ValueError when the batch yields an empty
        # vocabulary (e.g. every token is filtered out).
        print("警告: DBSCAN批次处理失败,可能因为内容过于单一或简短。将内容视为独立模板。")
        return set(processed_corpus)


def _extract_content(line):
    """Return the non-empty string ``content`` of one JSONL line, else ``None``."""
    try:
        record = json.loads(line)
    except json.JSONDecodeError:
        return None
    if not isinstance(record, dict):
        return None
    content = record.get('content')
    if isinstance(content, str) and content:
        return content
    return None


def extract_templates_iterative(input_file, output_file, rules, batch_size=1000, eps=0.4, min_samples=2):
    """Extract templates with a mini-batch, rule + clustering hybrid strategy.

    Lines that are not valid JSON objects, or whose ``content`` is missing,
    empty or not a string, are skipped.  A message whose normalized form is
    already a known template is skipped; a message matched by one of
    ``rules`` contributes its normalized form directly; everything else is
    buffered and clustered with ``run_dbscan_on_corpus`` once ``batch_size``
    messages have accumulated (and once more at end of file).

    Args:
        input_file: Path of the UTF-8 JSONL input file.
        output_file: Path of the output file; one ``{"content": template}``
            JSON object is written per line, sorted by template text.
        rules: Iterable of dicts whose ``'pattern'`` is a compiled regex that
            is matched against the raw message.
        batch_size: Number of unmatched messages that triggers clustering.
        eps: DBSCAN ``eps`` parameter.
        min_samples: DBSCAN ``min_samples`` parameter.

    Returns:
        None.  If ``input_file`` does not exist an error is printed and
        nothing is written.
    """
    from tqdm import tqdm

    print("--- 开始迭代式模板提取 ---")
    final_templates = set()
    unmatched_batch = []
    batch_num = 1

    print(f"步骤 1: 逐行处理 '{input_file}' 并动态构建模板库...")
    try:
        # First pass only counts lines so the progress bar has a total.
        with open(input_file, 'r', encoding='utf-8') as f:
            total_lines = sum(1 for _ in f)
    except FileNotFoundError:
        print(f"错误:找不到输入文件 '{input_file}'。")
        return

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f, total=total_lines, desc="主进程"):
            content = _extract_content(line)
            if content is None:
                continue

            normalized_content = normalize_text(content)

            # 1. Already covered by a discovered template.
            if normalized_content in final_templates:
                continue

            # 2. Covered by a predefined rule.
            if any(rule['pattern'].match(content) for rule in rules):
                final_templates.add(normalized_content)
                continue

            # 3. Otherwise buffer the raw message for clustering.
            unmatched_batch.append(content)

            # 4. Cluster once the buffer is full.
            if len(unmatched_batch) >= batch_size:
                print(f"\n--- 处理批次 #{batch_num} (大小: {len(unmatched_batch)}) ---")
                newly_found_templates = run_dbscan_on_corpus(unmatched_batch, eps, min_samples)

                print(f"批次 #{batch_num}: DBSCAN 发现了 {len(newly_found_templates)} 个潜在模板。")
                final_templates.update(newly_found_templates)
                print(f"当前总模板数: {len(final_templates)}")

                unmatched_batch.clear()
                batch_num += 1

    # --- Flush the last, partially filled batch ---
    print("\n--- 文件处理完毕,处理最后一批剩余内容 ---")
    if unmatched_batch:
        print(f"处理最后一个批次 (大小: {len(unmatched_batch)})")
        newly_found_templates = run_dbscan_on_corpus(unmatched_batch, eps, min_samples)

        print(f"最后一个批次: DBSCAN 发现了 {len(newly_found_templates)} 个潜在模板。")
        final_templates.update(newly_found_templates)
    else:
        print("没有剩余内容需要处理。")

    # --- Output ---
    print("\n--- 第 3 部分: 合并结果并保存 ---")
    print(f"总共找到 {len(final_templates)} 个唯一的模板。")

    with open(output_file, 'w', encoding='utf-8') as f:
        for template in sorted(final_templates):
            json.dump({"content": template}, f, ensure_ascii=False)
            f.write('\n')

    print(f"所有模板已成功写入到 '{output_file}'。")


# --- Usage example ---
# Assumes an earlier script has already produced 'content_filtered.jsonl'.
input_jsonl_file = 'content_filtered.jsonl'
output_template_file = 'templates_iterative.txt'
BATCH_PROCESSING_SIZE = 10000  # tune to available memory and data volume

if __name__ == "__main__":
    extract_templates_iterative(
        input_file=input_jsonl_file,
        output_file=output_template_file,
        rules=PREDEFINED_RULES,
        batch_size=BATCH_PROCESSING_SIZE,
        eps=0.4,
        min_samples=5,  # a slightly higher min_samples gives more reliable templates
    )