This commit is contained in:
dev07 2025-09-09 18:35:41 +08:00
parent 254e7e79e9
commit 3fcb7d52aa

View File

@ -18,61 +18,59 @@ ABSTRACT_RULES = [
# 示例:把原始正则和占位符显式化
'source_regex': r'^Sent GCash to (.+?) with account ending in (\d+)$',
'template': 'Sent GCash to <收款人名称> with account ending in <银行4位数尾号>',
'placeholders': ['收款人名称', '银行4位数尾号']
},
{
# Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <网络或发票号>
# Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <流水号>
'source_regex': r'(?i)^Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$',
'template': 'Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <网络或发票号>',
'placeholders': ['付款人名称', '银行4位数尾号', '网络或发票号']
'template': 'Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <流水号>',
},
{
# Payment to <商户名>, Merchant Transaction Number: <交易单号>
'source_regex': r'^Payment to (.+?), Merchant Transaction Number: (.+)$',
'template': 'Payment to <收款人名称>, Merchant Transaction Number: <交易单号>',
'placeholders': ['收款人名称', '交易单号']
},
# 以下条目为自动迁移自 normalize_text 的简单规则
{
'source_regex': r'^You have sent PHP [\d,]+\.\d{2} to an unverified account [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \.\..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\. Go to GCash Help Center to know how to secure your transactions\.$',
'template': 'You have sent PHP <金额> to an unverified account <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <流水号>. Go to GCash Help Center to know how to secure your transactions.',
'placeholders': ['金额', '收款人号码', '日期', '时间', '消息', '金额', '流水号']
'template': 'You have sent PHP <金额> to an unverified account <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <交易单号>. Go to GCash Help Center to know how to secure your transactions.',
},{
'source_regex': r'^You have sent PHP [\d,]+\.\d{2} to .+? [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: .*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$',
'template': 'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <参考号>.',
'placeholders': ['金额', '收款人名称', '收款人号码', '日期', '时间', '消息', '金额', '参考号']
'template': 'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <交易单号>.',
},{
'source_regex': r'^You have received\s+(?:PHP\s+)?[\d,.]+\s+of GCash from\s+.+?\. Your new balance is\s+(?:PHP\s+)?[\d,.]+\s+\d{1,2}-\d{1,2}-\d{2,4}\s+\d{1,2}:\d{1,2}(?::\d{1,2})?\s+[AP]M\. Ref\. No\.\s+.+?\. Use now to buy load, purchase items, send money, pay bills, and a lot more!$',
'template': 'You have received <金额> of GCash from <付款人名称>. Your new balance is <金额>. <日期时间>. Ref. No. <流水号>. Use now to buy load, purchase items, send money, pay bills, and a lot more!',
'placeholders': ['金额','付款人名称','金额','日期时间','流水号']
'template': 'You have received <金额> of GCash from <付款人名称>. Your new balance is <金额>. <日期时间>. Ref. No. <交易单号>. Use now to buy load, purchase items, send money, pay bills, and a lot more!',
},{
'source_regex': r'^You have received PHP [\d,.]+\s+of GCash from .+? w/ MSG: .*\. (?:Your new balance is PHP [\d,.]*\.\s)?Ref\. No\. \d+\.(?: To access your funds,.*)?$',
'template': 'You have received PHP <金额> of GCash from <付款人名称> w/ MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <参考号>.',
'placeholders': ['金额','付款人名称','消息','金额','参考号']
'template': 'You have received PHP <金额> of GCash from <付款人名称> w/ MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <交易单号>.',
},{
'source_regex': r'^You have paid P[\d,.]+\s+via GCash to .+? on \d{1,2}-\d{1,2}-\d{2,4}\s\d{1,2}:\d{1,2}:\d{1,2}\s+[AP]M\. Ref\. No\.\s+\d+\. QRPH Invoice No\.\s+\d+\.$',
'template': 'You have paid P<金额> via GCash to <收款人名称> on <日期时间>. Ref. No. <参考号>. QRPH Invoice No. <参考号>.',
'placeholders': ['金额','收款人名称','日期时间','参考号','参考号']
'template': 'You have paid P<金额> via GCash to <收款人名称> on <日期时间>. Ref. No. <交易单号>. QRPH Invoice No. <流水号>.',
},{
'source_regex': r'(?i)^Received GCash from [^<]+$',
'template': 'Received GCash from <付款人名称>',
'placeholders': ['付款人名称']
},{
'source_regex': r'^Payment to ([^,]+)$',
'template': 'Payment to <收款人名称>',
'placeholders': ['收款人名称']
},{
'source_regex': r'^(.+?) with (Ref\. no\.|Parent Ref\.No\.|Reference No\.) (.+)$',
'template': '<交易类型> with Ref. no. <参考号>',
'placeholders': ['交易类型','参考号']
'template': '<交易类型> with Ref. no. <交易单号>',
},{
'source_regex': r'^Buy Load Transaction for .+$',
'template': 'Buy Load Transaction for <付款人号码>',
'placeholders': ['付款人号码']
},{
'source_regex': r'^Transfer from \S+ to \S+$',
'template': 'Transfer from <付款人号码> to <收款人号码>',
'placeholders': ['付款人号码', '收款人号码']
}
]
@ -372,6 +370,11 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000
final_templates[tpl].append(content)
# 注册元信息(占位符与其组正则)
if tpl not in templates_meta:
# If no explicit 'placeholders' list is provided, extract the placeholder names automatically from the template string
placeholders = ar.get('placeholders')
if not placeholders:
placeholders = re.findall(r'<([^>]+)>', ar['template'])
# 尝试从 source_regex 中提取顶层捕获组模式并按占位符顺序配对
try:
group_patterns = extract_group_patterns_from_regex(ar['source_regex'])
@ -379,7 +382,7 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000
group_patterns = []
phs = []
for idx, name in enumerate(ar.get('placeholders', [])):
for idx, name in enumerate(placeholders):
# 如果成功提取到对应的捕获组模式,则使用之,否则回退到通配 '(.+?)'
pat = group_patterns[idx] if idx < len(group_patterns) and group_patterns[idx] else '(.+?)'
phs.append({'name': name, 'pattern': pat})