ml_data_template/sql_ml.py

655 lines
31 KiB
Python
Raw Normal View History

2025-09-08 23:40:28 +08:00
import json
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
2025-09-09 01:40:25 +08:00
import argparse
2025-09-08 23:40:28 +08:00
# --- Rules and helper functions (same as before) ---
# Hand-written rules passed to extract_templates_iterative as `rules`.
# Each rule is a dict with a 'name' and a compiled 'pattern'; the extractor
# calls rule['pattern'].match(content) on the raw message.
# The list is currently empty: every example below is commented out.
PREDEFINED_RULES = [
    # {'name': 'Transfer', 'pattern': re.compile(r'^Transfer from \d+ to \d+$')},
    # {'name': 'International Remittance', 'pattern': re.compile(r'^International Remittance$')},
    # {'name': 'Bill Payment', 'pattern': re.compile(r'^Bill payment successful for amount \d+$')},
    # {'name': 'New Message', 'pattern': re.compile(r'^You have a new message from \d+\.$')},
    # # Added rule: matches text such as "Sent GCash to GoTyme Bank with account ending in 6784"
    # {'name': 'Sent GCash to Account', 'pattern': re.compile(r'^Sent GCash to .+? with account ending in \d+$')}
]
2025-09-09 15:56:47 +08:00
# Abstract rules: each entry declares the source regex, the resulting template,
# the placeholder names, and the regex of each capture group together.
# 'placeholders' and 'group_patterns' are parallel lists (zipped pairwise by
# extract_templates_iterative when it registers template metadata).
ABSTRACT_RULES = [
    {
        # Example: the original regex with its placeholders and per-group
        # patterns spelled out explicitly.
        'source_regex': r'^Sent GCash to (.+?) with account ending in (\d+)$',
        'template': 'Sent GCash to <收款人名称> with account ending in <银行4位数尾号>',
        'placeholders': ['收款人名称', '银行4位数尾号'],
        'group_patterns': ['(.+?)', '(\\d+)']
    },
    # More abstract rules can be appended here.
]
2025-09-09 01:40:25 +08:00
# Mapping from a template placeholder (including its angle brackets) to the
# capturing regex used to extract that placeholder's value from a message.
# Every pattern is wrapped in exactly one capture group; most are lazy
# catch-alls, while a few (amount, 4-digit bank suffix, date, time, phone-like
# numbers) are restricted to a specific character set or layout.
PLACEHOLDER_PATTERNS = {
    '<金额>': r'([\d,.]+)',
    '<付款人名称>': r'(.+?)',
    '<收款人名称>': r'(.+?)',
    '<付款人号码>': r'([\d\w\+\-\(\)]+)',
    '<收款人号码>': r'([\d\w\+\-\(\)]+)',
    '<银行4位数尾号>': r'(\d{4})',
    '<参考号>': r'(.+?)',
    '<交易单号>': r'(.+?)',
    '<日期时间>': r'(.+?)',
    '<日期>': r'(\d{2}-\d{2}-\d{4})',
    '<时间>': r'(\d{1,2}:\d{2}\s[AP]M)',
    '<消息>': r'(.+?)',
    '<流水号>': r'(.+?)',
    '<网络或发票号>': r'(.+?)',
    '<交易类型>': r'(.+?)',
}
2025-09-09 15:10:49 +08:00
def get_placeholder(action):
    """Return the placeholder name matching a JSON message's action text.

    'Received' is checked before 'Sent', so an action containing both maps
    to the payer number. Only an action that contains 'Sent' without
    'Received' maps to the payee number; everything else (including
    'Refunded' and unrecognised actions) maps to the payer number.
    """
    payer_number = '付款人号码'
    payee_number = '收款人号码'

    if 'Received' in action:
        return payer_number
    if 'Sent' in action:
        return payee_number
    # 'Refunded' and the default case share the same placeholder.
    return payer_number
2025-09-08 23:40:28 +08:00
def _json_target_replacement(match):
    """Rebuild an escaped-JSON action/target pair with the target replaced by a placeholder."""
    action = match.group(1)
    return f'\\\"{action}\\\",\\\"target\\\":\\\"<{get_placeholder(action)}>\\\"'


# Ordered (pattern, replacement) pairs. Order matters: each rule runs on the
# output of the previous one, and specific rules come before the general
# rules that would otherwise swallow the same text.
# NOTE(review): the "You have sent ..." patterns contain `MSG: \..*?`, which
# requires a literal '.' right after "MSG: " -- confirm this is intended and
# not a typo for `.*?\.`.
_NORMALIZE_RULES = (
    # Pattern 8: received from a bank (must run first).
    (
        r'(?i)Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$',
        r'Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <网络或发票号>',
    ),
    # Pattern 13: receipt for sending to an unverified account.
    (
        r'^You have sent PHP [\d,]+\.\d{2} to an unverified account [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\. Go to GCash Help Center to know how to secure your transactions\.$',
        r'You have sent PHP <金额> to an unverified account <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <流水号>. Go to GCash Help Center to know how to secure your transactions.',
    ),
    # Pattern 12: detailed send receipt (masked multi-part names).
    (
        r'^You have sent PHP [\d,]+\.\d{2} to (?:[A-Z\*]+\s)+[A-Z\*]\.\s[\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$',
        r'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <流水号>.',
    ),
    # Pattern 11 (institution): detailed receive receipt.
    (
        r'^You have received\s+(?:PHP\s+)?[\d,.]+\s+of GCash from\s+.+?\. Your new balance is\s+(?:PHP\s+)?[\d,.]+\s+\d{1,2}-\d{1,2}-\d{2,4}\s+\d{1,2}:\d{1,2}(?::\d{1,2})?\s+[AP]M\. Ref\. No\.\s+.+?\. Use now to buy load, purchase items, send money, pay bills, and a lot more!$',
        r'You have received <金额> of GCash from <付款人名称>. Your new balance is <金额>. <日期时间>. Ref. No. <流水号>. Use now to buy load, purchase items, send money, pay bills, and a lot more!',
    ),
    # Pattern 11 (individual): detailed receive receipt.
    (
        r'^You have received PHP [\d,.]+\s+of GCash from .+? w/ MSG: .*\. (?:Your new balance is PHP [\d,.]*\.\s)?Ref\. No\. \d+\.(?: To access your funds,.*)?$',
        r'You have received PHP <金额> of GCash from <付款人名称> w/ MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <参考号>.',
    ),
    # Pattern 12 (general): detailed send receipt with any name format.
    (
        r'^You have sent PHP [\d,]+\.\d{2} to .+? [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$',
        r'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <参考号>.',
    ),
    # Pattern 10: paid via GCash with a QRPH invoice number.
    (
        r'^You have paid P[\d,.]+\s+via GCash to .+? on \d{1,2}-\d{1,2}-\d{2,4}\s\d{1,2}:\d{1,2}:\d{1,2}\s+[AP]M\. Ref\. No\.\s+\d+\. QRPH Invoice No\.\s+\d+\.$',
        r'You have paid P<金额> via GCash to <收款人名称> on <日期时间>. Ref. No. <参考号>. QRPH Invoice No. <参考号>.',
    ),
    # Pattern 9: sent to an institution account.
    (
        r'Sent GCash to (.+?) with account ending in (\d+)$',
        r'Sent GCash to <收款人名称> with account ending in <银行4位数尾号>',
    ),
    # Pattern 7: received from a general source; only fires on text the
    # bank rule above left untouched (no '<' present yet).
    (
        r'(?i)^Received GCash from [^<]+$',
        r'Received GCash from <付款人名称>',
    ),
    # Pattern 6: payment carrying a merchant transaction number.
    (
        r'Payment to (.+?), Merchant Transaction Number: (.+)$',
        r'Payment to <收款人名称>, Merchant Transaction Number: <交易单号>',
    ),
    # Pattern 5: plain payment to a merchant.
    (
        r'^Payment to ([^,]+)$',
        r'Payment to <收款人名称>',
    ),
    # Any transaction type followed by a reference number.
    (
        r'^(.+?) with (Ref\. no\.|Parent Ref\.No\.|Reference No\.) (.+)$',
        r'<交易类型> with Ref. no. <参考号>',
    ),
    (
        r'Sent GCash to <收款人名称> with account ending in (\d+)$',
        r'Sent GCash to <收款人名称> with account ending in <银行4位数尾号>',
    ),
    (
        r'^Transfer from \S+ to \S+$',
        r'Transfer from <付款人号码> to <收款人号码>',
    ),
    # Pattern 8 again, with a different trailing placeholder.
    (
        r'(?i)Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$',
        r'Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <流水号>',
    ),
    # Buy Load transactions.
    (
        r'^Buy Load Transaction for .+$',
        r'Buy Load Transaction for <付款人号码>',
    ),
    # Refunds.
    (
        r'^Refund from .+$',
        r'Refund from <收款人名称>',
    ),
    # Escaped-JSON messages: replace the target of each known action type.
    (
        r'\\\"(Received money from|Sent money to|Received settlement from|Reversed settlement from|Refunded money via|Sent money via)\\\",\\\"target\\\":\\\"(.+?)\\\"',
        _json_target_replacement,
    ),
)


def normalize_text(text):
    """Replace the variable parts of a transaction message with placeholders.

    Applies every rule in ``_NORMALIZE_RULES`` in order, each on the result
    of the previous one, and returns the final text. Text that matches no
    rule is returned unchanged.
    """
    for pattern, replacement in _NORMALIZE_RULES:
        text = re.sub(pattern, replacement, text)
    return text
2025-09-09 01:40:25 +08:00
def template_to_regex(template):
    """Turn a placeholder template into a regex string for value extraction.

    The whole template is regex-escaped first; then each known placeholder
    (in its escaped form) is swapped for the capture group registered in
    ``PLACEHOLDER_PATTERNS``. Placeholders missing from that mapping stay as
    literal text. The result is neither compiled nor anchored.
    """
    regex = re.escape(template)
    for token, capture_group in PLACEHOLDER_PATTERNS.items():
        # The token is escaped the same way as the template so the two line up.
        regex = regex.replace(re.escape(token), capture_group)
    return regex
2025-09-09 15:56:47 +08:00
def build_regex_from_template_data(template_data):
    """Build the (uncompiled) regex string used to match one template.

    Resolution order:

    1. If ``template_data['source_regex']`` is set, return it unchanged.
    2. If there is no usable ``placeholders`` metadata, fall back to
       ``template_to_regex`` on ``template_data['content']``.
    3. Otherwise escape the template text and replace each declared
       placeholder with a named capture group.

    ``placeholders`` may be a list of ``{'name': ..., 'pattern': ...}`` dicts
    or a list of bare names (patterns then come from
    ``PLACEHOLDER_PATTERNS``). Entries without a string name are ignored.

    Generated regexes (cases 2 and 3) are anchored with ``^...$`` so that
    trailing lazy groups such as ``.+?`` capture the whole value rather than
    a single character when the caller uses ``re.search``.

    Group names: a placeholder name that is a valid Python identifier
    (non-ASCII names such as CJK ones included) is used as-is. Other names
    have every character outside ``[0-9A-Za-z_]`` replaced by ``_``. When a
    name would repeat, for example the same placeholder occurring twice in
    the template, later groups get a ``_2``, ``_3``, ... suffix, because
    duplicate group names make the regex invalid.

    Returns:
        The regex as a string.
    """
    # A hand-written source regex always wins and is passed through verbatim.
    source_regex = template_data.get('source_regex')
    if source_regex:
        return source_regex

    template = template_data.get('content', '')
    placeholders_meta = template_data.get('placeholders')

    def legacy_regex():
        # The non-capturing wrapper keeps the anchors bound to the whole pattern.
        return f'^(?:{template_to_regex(template)})$'

    if not placeholders_meta:
        return legacy_regex()

    def strip_outer_parens(pat: str) -> str:
        # Drop one enclosing pair of parentheses so the named group does not
        # wrap a second, anonymous capture group.
        pat = pat.strip()
        if pat.startswith('(') and pat.endswith(')'):
            return pat[1:-1]
        return pat

    # placeholder name -> inner pattern; the first declaration of a name wins.
    inner_patterns = {}
    for ph in placeholders_meta:
        if isinstance(ph, dict):
            name = ph.get('name')
            pattern = ph.get('pattern') or '.+?'
        else:
            # Legacy format: only the name is given.
            name = ph
            pattern = PLACEHOLDER_PATTERNS.get(f'<{name}>', '(.+?)')
        if not isinstance(name, str) or not name:
            continue
        inner_patterns.setdefault(name, strip_outer_parens(pattern))

    if not inner_patterns:
        return legacy_regex()

    used_group_names = set()

    def unique_group_name(name: str) -> str:
        base = name if name.isidentifier() else re.sub(r"[^0-9A-Za-z_]", "_", name)
        if not base.isidentifier():
            # For example, a name that starts with a digit.
            base = f'_{base}'
        candidate = base
        suffix = 1
        while candidate in used_group_names:
            suffix += 1
            candidate = f'{base}_{suffix}'
        used_group_names.add(candidate)
        return candidate

    # Walk the raw template once. Literal stretches are escaped and each
    # placeholder occurrence becomes its own named group. A single pass is
    # required because a generated group such as "(?P<name>...)" itself
    # contains the text "<name>".
    token_re = re.compile('|'.join(
        re.escape(f'<{name}>')
        for name in sorted(inner_patterns, key=len, reverse=True)
    ))
    parts = []
    cursor = 0
    for token in token_re.finditer(template):
        parts.append(re.escape(template[cursor:token.start()]))
        name = token.group()[1:-1]
        parts.append(f'(?P<{unique_group_name(name)}>{inner_patterns[name]})')
        cursor = token.end()
    parts.append(re.escape(template[cursor:]))
    return '^' + ''.join(parts) + '$'
2025-09-09 01:40:25 +08:00
def extract_parameters(template, message):
    """Extract placeholder values from ``message`` according to ``template``.

    The template is converted with ``template_to_regex`` and its capture
    groups are paired, in order, with the ``<...>`` placeholders found in
    the template.

    A whole-message match is tried first. With an unanchored search, a lazy
    group at the end of the pattern (e.g. ``(.+?)``) stops after a single
    character; matching the full message makes it capture the complete
    value. If the message does not match as a whole, an unanchored search
    is still attempted, so anything that matched before still matches.

    Args:
        template: Template text containing ``<placeholder>`` tokens.
        message: Raw message to extract values from.

    Returns:
        A dict mapping each placeholder (with angle brackets) to its
        captured value, or an empty dict when nothing matches. If a
        placeholder occurs more than once in the template, the value of its
        last occurrence is kept.
    """
    pattern = template_to_regex(template)
    match = re.fullmatch(pattern, message) or re.search(pattern, message)
    if not match:
        return {}

    placeholders = re.findall(r'<[^>]+>', template)
    # zip stops at the shorter sequence, so surplus placeholders or groups
    # are ignored.
    return dict(zip(placeholders, match.groups()))
2025-09-09 01:32:20 +08:00
def run_dbscan_on_corpus(corpus, eps, min_samples, max_samples=10):
    """Cluster a batch of messages and return one template per cluster.

    Every message is normalized with ``normalize_text``, vectorized with
    TF-IDF and clustered with DBSCAN using the cosine metric.

    * Noise points (label ``-1``) are each kept as their own template,
      keyed by normalized text.
    * For each cluster, the member closest to the cluster centroid (by
      cosine similarity) supplies the template text.

    Args:
        corpus: List of raw message strings.
        eps: DBSCAN ``eps`` value.
        min_samples: DBSCAN ``min_samples`` value.
        max_samples: Maximum number of original messages kept per template.

    Returns:
        A dict mapping normalized template text to a list of at most
        ``max_samples`` original messages. An empty corpus yields ``{}``.
        If vectorizing or clustering raises ``ValueError``, each normalized
        message is returned as its own template, with its originals
        collected up to ``max_samples``.
    """
    if not corpus:
        return {}

    processed_corpus = [normalize_text(text) for text in corpus]

    def add_sample(store, template, original):
        # Create the bucket on first sight and respect the per-template cap.
        samples = store.setdefault(template, [])
        if len(samples) < max_samples:
            samples.append(original)

    try:
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(processed_corpus)
        db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine', n_jobs=-1).fit(X)
        labels = db.labels_

        dbscan_templates = {}
        # Sorted so noise (-1) is always handled first and the result does
        # not depend on set iteration order.
        for label in sorted(set(labels)):
            class_member_indices = np.where(labels == label)[0]

            if label == -1:
                # Noise points: each one stands as its own template.
                for idx in class_member_indices:
                    add_sample(dbscan_templates, processed_corpus[idx], corpus[idx])
                continue

            # Real cluster: pick the member most similar to the centroid.
            cluster_vectors = X[class_member_indices]
            centroid = np.asarray(cluster_vectors.mean(axis=0))
            similarities = cosine_similarity(cluster_vectors, centroid)
            representative_idx = class_member_indices[np.argmax(similarities)]
            representative = processed_corpus[representative_idx]
            cluster_originals = [corpus[idx] for idx in class_member_indices]
            dbscan_templates[representative] = cluster_originals[:max_samples]
        return dbscan_templates
    except ValueError:
        # Raised, for example, when TfidfVectorizer ends up with an empty
        # vocabulary.
        print("警告: DBSCAN批次处理失败可能因为内容过于单一或简短。将内容视为独立模板。")
        fallback_templates = {}
        # Aggregate like the noise path, so messages sharing a normalized
        # form do not overwrite one another.
        for template, original in zip(processed_corpus, corpus):
            add_sample(fallback_templates, template, original)
        return fallback_templates
2025-09-08 23:40:28 +08:00
2025-09-09 15:14:44 +08:00
def extract_templates_iterative(input_files, output_file, rules, batch_size=1000, eps=0.4, min_samples=2, max_samples_per_template=0, content_key='content'):
    """Extract message templates from JSONL files with a hybrid, batched strategy.

    Each input line is parsed as JSON and the value under ``content_key`` is
    normalized with ``normalize_text``. A message is then handled by the
    first step that applies:

    1. Its normalized form is already a known template: store it as a sample.
    2. It matches one of ``rules`` (``rule['pattern'].match``): register its
       normalized form as a template.
    3. It matches an entry of ``ABSTRACT_RULES``: register that rule's
       template together with its placeholder metadata.
    4. Otherwise it is queued. Once ``batch_size`` messages are queued, they
       are clustered with ``run_dbscan_on_corpus`` and the resulting
       templates are merged in. The final partial batch is processed after
       all files are read.

    At most 10 original messages are retained per template while processing.
    Results are written to ``output_file`` as JSON lines of the form
    ``{"template": ..., ["source_regex": ...,] "regex": {name: pattern},
    "data": [...]}``, sorted by template text.

    Lines that are not valid JSON, are not JSON objects, or whose content is
    missing, empty or not a string are skipped.

    Args:
        input_files: Paths of the JSONL input files.
        output_file: Path of the template file to write.
        rules: Predefined rules, each a dict with a compiled ``'pattern'``.
        batch_size: Number of unmatched messages that triggers clustering.
        eps: DBSCAN ``eps`` value.
        min_samples: DBSCAN ``min_samples`` value.
        max_samples_per_template: Number of sample messages written per
            template; ``0`` writes an empty ``data`` list.
        content_key: JSON key holding the message text.

    Returns:
        None. If a file cannot be found, an error is printed and nothing
        further is done.
    """
    import os

    sample_cap = 10  # samples retained per template while processing

    print("--- 开始迭代式模板提取 ---")
    final_templates = {}  # template -> list of original contents
    # template -> {'placeholders': [{name, pattern}, ...], 'source_regex': str}
    templates_meta = {}
    unmatched_batch = []
    batch_num = 1

    def remember(template, original):
        """Store one sample for a template, creating the entry if needed."""
        samples = final_templates.setdefault(template, [])
        if len(samples) < sample_cap:
            samples.append(original)

    def merge_batch(found_templates):
        """Merge DBSCAN output into final_templates, honouring the sample cap."""
        for template, originals in found_templates.items():
            samples = final_templates.setdefault(template, [])
            remaining = max(0, sample_cap - len(samples))
            samples.extend(originals[:remaining])

    def match_abstract_rule(content):
        """Register the first matching abstract rule; return True on a match."""
        for ar in ABSTRACT_RULES:
            try:
                matched = re.match(ar['source_regex'], content)
            except re.error:
                # A broken abstract rule is skipped rather than aborting the run.
                continue
            if not matched:
                continue
            tpl = ar['template']
            remember(tpl, content)
            if tpl not in templates_meta:
                templates_meta[tpl] = {
                    'placeholders': [
                        {'name': name, 'pattern': gp}
                        for name, gp in zip(ar.get('placeholders', []), ar.get('group_patterns', []))
                    ],
                    'source_regex': ar['source_regex'],
                }
            return True
        return False

    try:
        print(f"步骤 1: 逐行处理输入文件 {input_files} 并动态构建模板库...")
        # Count lines once per file, up front. This feeds the progress bars
        # and makes a missing input file fail before any processing begins.
        line_counts = []
        for input_file in input_files:
            with open(input_file, 'r', encoding='utf-8') as f:
                line_counts.append(sum(1 for _ in f))

        for input_file, file_lines in zip(input_files, line_counts):
            print(f"\n--- 开始处理文件: {input_file} ---")
            with open(input_file, 'r', encoding='utf-8') as f:
                for line in tqdm(f, total=file_lines, desc=f"处理 {os.path.basename(input_file)}"):
                    try:
                        data = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    if not isinstance(data, dict):
                        continue
                    content = data.get(content_key)
                    if not content or not isinstance(content, str):
                        continue

                    normalized_content = normalize_text(content)

                    # 1. Already-known template.
                    if normalized_content in final_templates:
                        remember(normalized_content, content)
                        continue

                    # 2. Predefined rules, then abstract rules.
                    matched_by_rule = False
                    for rule in rules:
                        if rule['pattern'].match(content):
                            remember(normalized_content, content)
                            matched_by_rule = True
                            break
                    if not matched_by_rule:
                        matched_by_rule = match_abstract_rule(content)
                    if matched_by_rule:
                        continue

                    # 3. Nothing matched: queue for clustering.
                    unmatched_batch.append(content)

                    # 4. Cluster once the batch is full.
                    if len(unmatched_batch) >= batch_size:
                        print(f"\n--- 处理批次 #{batch_num} (大小: {len(unmatched_batch)}) ---")
                        newly_found_templates = run_dbscan_on_corpus(unmatched_batch, eps, min_samples, sample_cap)
                        print(f"批次 #{batch_num}: DBSCAN 发现了 {len(newly_found_templates)} 个潜在模板。")
                        merge_batch(newly_found_templates)
                        print(f"当前总模板数: {len(final_templates)}")
                        unmatched_batch.clear()
                        batch_num += 1

        # --- Wrap-up: cluster whatever is left in the queue ---
        print("\n--- 文件处理完毕,处理最后一批剩余内容 ---")
        if unmatched_batch:
            print(f"处理最后一个批次 (大小: {len(unmatched_batch)})")
            newly_found_templates = run_dbscan_on_corpus(unmatched_batch, eps, min_samples, sample_cap)
            print(f"最后一个批次: DBSCAN 发现了 {len(newly_found_templates)} 个潜在模板。")
            merge_batch(newly_found_templates)
        else:
            print("没有剩余内容需要处理。")

        # --- Output ---
        print("\n--- 第 3 部分: 合并结果并保存 ---")
        print(f"总共找到 {len(final_templates)} 个唯一的模板。")
        with open(output_file, 'w', encoding='utf-8') as f:
            for template, data_list in sorted(final_templates.items()):
                # Record layout: {"template", ["source_regex"], "regex", "data"}.
                output_obj = {'template': template}
                regex_map = {}  # placeholder name -> pattern
                meta = templates_meta.get(template)
                if meta is not None:
                    for ph in meta.get('placeholders', []):
                        if isinstance(ph, dict):
                            regex_map[ph.get('name')] = ph.get('pattern')
                        else:
                            # Bare name: use the global default pattern.
                            regex_map[ph] = PLACEHOLDER_PATTERNS.get(f'<{ph}>', '(.+?)')
                    if meta.get('source_regex'):
                        output_obj['source_regex'] = meta['source_regex']
                else:
                    # No metadata: derive the map from the placeholders that
                    # appear in the template text.
                    for name in re.findall(r'<([^>]+)>', template):
                        regex_map[name] = PLACEHOLDER_PATTERNS.get(f'<{name}>', '(.+?)')
                output_obj['regex'] = regex_map
                # Sample data (empty unless max_samples_per_template is set).
                output_obj['data'] = data_list[:max_samples_per_template] if max_samples_per_template != 0 else []
                json.dump(output_obj, f, ensure_ascii=False)
                f.write('\n')
        print(f"所有模板已成功写入到 '{output_file}'")
    except FileNotFoundError as e:
        print(f"错误:找不到输入文件 {e.filename}")
        return
2025-09-09 15:14:44 +08:00
def extract_values_with_templates(input_files, template_file, output_file, content_key='content'):
    """Extract placeholder values from raw messages using a template file.

    ``template_file`` is read as JSON lines. Two record formats are accepted
    and normalized to ``{'content', 'placeholders', 'source_regex'}``:

    * new: ``{"template", "regex": {name: pattern}, "source_regex", "data"}``
    * old: ``{"content", "placeholders", "source_regex"}``

    Records in any other shape, and lines that are not valid JSON, are
    skipped.

    For every message (the value under ``content_key`` of each JSON line in
    ``input_files``), the templates are tried in file order and the first
    one that yields at least one parameter wins. For each template:

    * the regex from ``build_regex_from_template_data`` is searched;
    * named groups are reported under their original placeholder names,
      otherwise positional groups are paired with the template's
      placeholders in order;
    * if that regex is invalid or does not match, ``extract_parameters`` is
      used on the template text as a fallback.

    Regexes are built and compiled once per template rather than once per
    message. Input lines that are not valid JSON, are not objects, or whose
    content is missing, empty or not a string are skipped.

    Args:
        input_files: Paths of the JSONL input files.
        template_file: Path of the template file to load.
        output_file: Path of the JSONL file to write; each line holds
            ``{"template", "message", "parameters"}``.
        content_key: JSON key holding the message text.
    """
    import os

    print("--- 开始使用模板提取参数值 ---")

    def normalise_record(raw):
        """Convert one template record to the internal format, or return None."""
        if not isinstance(raw, dict):
            return None
        if 'template' in raw:
            tmeta = {'content': raw.get('template')}
            regex_map = raw.get('regex') or {}
            if not isinstance(regex_map, dict):
                regex_map = {}
            phs = [{'name': name, 'pattern': pat} for name, pat in regex_map.items()]
            if 'source_regex' in raw:
                tmeta['source_regex'] = raw.get('source_regex')
            if phs:
                tmeta['placeholders'] = phs
            return tmeta
        if 'content' in raw:
            # Backward-compatible format.
            tmeta = {'content': raw.get('content')}
            if 'placeholders' in raw:
                tmeta['placeholders'] = raw.get('placeholders')
            if 'source_regex' in raw:
                tmeta['source_regex'] = raw.get('source_regex')
            return tmeta
        return None

    templates_meta = []
    with open(template_file, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                raw = json.loads(line)
            except json.JSONDecodeError:
                continue
            tmeta = normalise_record(raw)
            if tmeta is not None:
                templates_meta.append(tmeta)
    print(f"已加载 {len(templates_meta)} 个模板(含元信息/已规范化)")

    # Per-template data is invariant across messages, so prepare it once.
    prepared = []
    for tmeta in templates_meta:
        try:
            compiled = re.compile(build_regex_from_template_data(tmeta))
        except (re.error, TypeError):
            # Unusable regex: this template relies on the fallback path only.
            compiled = None

        # Sanitized group name -> original placeholder name.
        ph_map = {}
        placeholders = tmeta.get('placeholders')
        if isinstance(placeholders, list):
            for ph in placeholders:
                orig = ph.get('name') if isinstance(ph, dict) else ph
                if isinstance(orig, str):
                    ph_map[re.sub(r"[^0-9A-Za-z_]", "_", orig)] = orig

        content_tpl = tmeta.get('content')
        positional = re.findall(r'<[^>]+>', content_tpl) if isinstance(content_tpl, str) else []
        prepared.append((tmeta, compiled, ph_map, positional))

    def parameters_for(tmeta, compiled, ph_map, positional, content):
        """Return the parameters one template extracts from content (may be empty)."""
        match = compiled.search(content) if compiled is not None else None
        if match:
            named = match.groupdict()
            if named:
                # Map group names back to the original placeholder names;
                # names that were not sanitized map to themselves.
                return {f'<{ph_map.get(group, group)}>': val for group, val in named.items()}
            # Positional groups follow the placeholder order in the template.
            return dict(zip(positional, match.groups()))
        tpl = tmeta.get('content')
        if tpl and isinstance(tpl, str):
            # Fallback: regex derived from the template string itself.
            return extract_parameters(tpl, content)
        return {}

    extracted_values = []
    for input_file in input_files:
        print(f"\n--- 开始处理文件: {input_file} ---")
        # Line count for the progress bar.
        with open(input_file, 'r', encoding='utf-8') as f:
            file_lines = sum(1 for _ in f)
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in tqdm(f, total=file_lines, desc=f"提取 {os.path.basename(input_file)}"):
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(data, dict):
                    continue
                content = data.get(content_key, '')
                if not content or not isinstance(content, str):
                    continue

                for tmeta, compiled, ph_map, positional in prepared:
                    parameters = parameters_for(tmeta, compiled, ph_map, positional, content)
                    if parameters:
                        extracted_values.append({
                            'template': tmeta.get('content'),
                            'message': content,
                            'parameters': parameters
                        })
                        # The first template that yields values wins.
                        break

    # Save the extracted values.
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in extracted_values:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')
    print(f"成功从 {len(extracted_values)} 条消息中提取参数,并保存到 '{output_file}'")
2025-09-08 23:40:28 +08:00
# --- Usage example ---
# Assumes an earlier script has already produced 'content_filtered.jsonl'.
input_jsonl_files = ['content_filtered.jsonl', 'output.jsonl']  # default input files; any number is supported
output_template_file = 'templates_iterative.txt'
BATCH_PROCESSING_SIZE = 10000  # tune to the available memory and data volume
# Default output for --extract_values. It must differ from the template file,
# otherwise extraction would overwrite the templates it has just read.
DEFAULT_EXTRACTED_VALUES_FILE = 'extracted_values.jsonl'


def build_arg_parser():
    """Create the command-line parser for template and value extraction."""
    parser = argparse.ArgumentParser(description='Extract templates from GCash transaction data.')
    parser.add_argument('--input_file', type=str, nargs='+', default=input_jsonl_files, help='Input JSONL file paths (multiple files supported)')
    parser.add_argument('--output_file', type=str, default=None, help=f'Output file path (default: {output_template_file} for template extraction, {DEFAULT_EXTRACTED_VALUES_FILE} with --extract_values)')
    parser.add_argument('--template_file', type=str, default=output_template_file, help='Template file read by --extract_values')
    parser.add_argument('--batch_size', type=int, default=BATCH_PROCESSING_SIZE, help='Batch processing size (data volume)')
    parser.add_argument('--eps', type=float, default=0.4, help='DBSCAN eps parameter')
    parser.add_argument('--min_samples', type=int, default=5, help='DBSCAN min_samples parameter')
    parser.add_argument('--max_samples_per_template', type=int, default=0, help='Sample messages written per template (0 writes none)')
    parser.add_argument('--extract_values', action='store_true', help='Extract values using generated templates')
    parser.add_argument('--content_key', type=str, default='content', help='Key to extract content from JSON objects (default: content)')
    return parser


def resolve_output_file(args):
    """Return the output path, applying the mode-specific default when unset."""
    if args.output_file:
        return args.output_file
    return DEFAULT_EXTRACTED_VALUES_FILE if args.extract_values else output_template_file


if __name__ == "__main__":
    args = build_arg_parser().parse_args()
    output_path = resolve_output_file(args)
    if args.extract_values:
        # Extract parameter values with previously generated templates.
        extract_values_with_templates(
            input_files=args.input_file,
            template_file=args.template_file,
            output_file=output_path,
            content_key=args.content_key
        )
    else:
        # Extract templates.
        extract_templates_iterative(
            input_files=args.input_file,
            output_file=output_path,
            rules=PREDEFINED_RULES,
            batch_size=args.batch_size,
            eps=args.eps,
            min_samples=args.min_samples,
            max_samples_per_template=args.max_samples_per_template,
            content_key=args.content_key
        )