update
This commit is contained in:
parent
254e7e79e9
commit
3fcb7d52aa
47
sql_ml.py
47
sql_ml.py
@ -18,61 +18,59 @@ ABSTRACT_RULES = [
|
|||||||
# 示例:把原始正则和占位符显式化
|
# 示例:把原始正则和占位符显式化
|
||||||
'source_regex': r'^Sent GCash to (.+?) with account ending in (\d+)$',
|
'source_regex': r'^Sent GCash to (.+?) with account ending in (\d+)$',
|
||||||
'template': 'Sent GCash to <收款人名称> with account ending in <银行4位数尾号>',
|
'template': 'Sent GCash to <收款人名称> with account ending in <银行4位数尾号>',
|
||||||
'placeholders': ['收款人名称', '银行4位数尾号']
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
# Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <网络或发票号>
|
# Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <流水号>
|
||||||
'source_regex': r'(?i)^Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$',
|
'source_regex': r'(?i)^Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$',
|
||||||
'template': 'Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <网络或发票号>',
|
'template': 'Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <流水号>',
|
||||||
'placeholders': ['付款人名称', '银行4位数尾号', '网络或发票号']
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
# Payment to <商户名>, Merchant Transaction Number: <交易单号>
|
# Payment to <商户名>, Merchant Transaction Number: <交易单号>
|
||||||
'source_regex': r'^Payment to (.+?), Merchant Transaction Number: (.+)$',
|
'source_regex': r'^Payment to (.+?), Merchant Transaction Number: (.+)$',
|
||||||
'template': 'Payment to <收款人名称>, Merchant Transaction Number: <交易单号>',
|
'template': 'Payment to <收款人名称>, Merchant Transaction Number: <交易单号>',
|
||||||
'placeholders': ['收款人名称', '交易单号']
|
|
||||||
},
|
},
|
||||||
# 以下条目为自动迁移自 normalize_text 的简单规则
|
# 以下条目为自动迁移自 normalize_text 的简单规则
|
||||||
{
|
{
|
||||||
'source_regex': r'^You have sent PHP [\d,]+\.\d{2} to an unverified account [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \.\..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\. Go to GCash Help Center to know how to secure your transactions\.$',
|
'source_regex': r'^You have sent PHP [\d,]+\.\d{2} to an unverified account [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \.\..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\. Go to GCash Help Center to know how to secure your transactions\.$',
|
||||||
'template': 'You have sent PHP <金额> to an unverified account <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <流水号>. Go to GCash Help Center to know how to secure your transactions.',
|
'template': 'You have sent PHP <金额> to an unverified account <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <交易单号>. Go to GCash Help Center to know how to secure your transactions.',
|
||||||
'placeholders': ['金额', '收款人号码', '日期', '时间', '消息', '金额', '流水号']
|
|
||||||
},{
|
},{
|
||||||
'source_regex': r'^You have sent PHP [\d,]+\.\d{2} to .+? [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: .*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$',
|
'source_regex': r'^You have sent PHP [\d,]+\.\d{2} to .+? [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: .*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$',
|
||||||
'template': 'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <参考号>.',
|
'template': 'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <交易单号>.',
|
||||||
'placeholders': ['金额', '收款人名称', '收款人号码', '日期', '时间', '消息', '金额', '参考号']
|
|
||||||
},{
|
},{
|
||||||
'source_regex': r'^You have received\s+(?:PHP\s+)?[\d,.]+\s+of GCash from\s+.+?\. Your new balance is\s+(?:PHP\s+)?[\d,.]+\s+\d{1,2}-\d{1,2}-\d{2,4}\s+\d{1,2}:\d{1,2}(?::\d{1,2})?\s+[AP]M\. Ref\. No\.\s+.+?\. Use now to buy load, purchase items, send money, pay bills, and a lot more!$',
|
'source_regex': r'^You have received\s+(?:PHP\s+)?[\d,.]+\s+of GCash from\s+.+?\. Your new balance is\s+(?:PHP\s+)?[\d,.]+\s+\d{1,2}-\d{1,2}-\d{2,4}\s+\d{1,2}:\d{1,2}(?::\d{1,2})?\s+[AP]M\. Ref\. No\.\s+.+?\. Use now to buy load, purchase items, send money, pay bills, and a lot more!$',
|
||||||
'template': 'You have received <金额> of GCash from <付款人名称>. Your new balance is <金额>. <日期时间>. Ref. No. <流水号>. Use now to buy load, purchase items, send money, pay bills, and a lot more!',
|
'template': 'You have received <金额> of GCash from <付款人名称>. Your new balance is <金额>. <日期时间>. Ref. No. <交易单号>. Use now to buy load, purchase items, send money, pay bills, and a lot more!',
|
||||||
'placeholders': ['金额','付款人名称','金额','日期时间','流水号']
|
|
||||||
},{
|
},{
|
||||||
'source_regex': r'^You have received PHP [\d,.]+\s+of GCash from .+? w/ MSG: .*\. (?:Your new balance is PHP [\d,.]*\.\s)?Ref\. No\. \d+\.(?: To access your funds,.*)?$',
|
'source_regex': r'^You have received PHP [\d,.]+\s+of GCash from .+? w/ MSG: .*\. (?:Your new balance is PHP [\d,.]*\.\s)?Ref\. No\. \d+\.(?: To access your funds,.*)?$',
|
||||||
'template': 'You have received PHP <金额> of GCash from <付款人名称> w/ MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <参考号>.',
|
'template': 'You have received PHP <金额> of GCash from <付款人名称> w/ MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <交易单号>.',
|
||||||
'placeholders': ['金额','付款人名称','消息','金额','参考号']
|
|
||||||
},{
|
},{
|
||||||
'source_regex': r'^You have paid P[\d,.]+\s+via GCash to .+? on \d{1,2}-\d{1,2}-\d{2,4}\s\d{1,2}:\d{1,2}:\d{1,2}\s+[AP]M\. Ref\. No\.\s+\d+\. QRPH Invoice No\.\s+\d+\.$',
|
'source_regex': r'^You have paid P[\d,.]+\s+via GCash to .+? on \d{1,2}-\d{1,2}-\d{2,4}\s\d{1,2}:\d{1,2}:\d{1,2}\s+[AP]M\. Ref\. No\.\s+\d+\. QRPH Invoice No\.\s+\d+\.$',
|
||||||
'template': 'You have paid P<金额> via GCash to <收款人名称> on <日期时间>. Ref. No. <参考号>. QRPH Invoice No. <参考号>.',
|
'template': 'You have paid P<金额> via GCash to <收款人名称> on <日期时间>. Ref. No. <交易单号>. QRPH Invoice No. <流水号>.',
|
||||||
'placeholders': ['金额','收款人名称','日期时间','参考号','参考号']
|
|
||||||
},{
|
},{
|
||||||
'source_regex': r'(?i)^Received GCash from [^<]+$',
|
'source_regex': r'(?i)^Received GCash from [^<]+$',
|
||||||
'template': 'Received GCash from <付款人名称>',
|
'template': 'Received GCash from <付款人名称>',
|
||||||
'placeholders': ['付款人名称']
|
|
||||||
},{
|
},{
|
||||||
'source_regex': r'^Payment to ([^,]+)$',
|
'source_regex': r'^Payment to ([^,]+)$',
|
||||||
'template': 'Payment to <收款人名称>',
|
'template': 'Payment to <收款人名称>',
|
||||||
'placeholders': ['收款人名称']
|
|
||||||
},{
|
},{
|
||||||
'source_regex': r'^(.+?) with (Ref\. no\.|Parent Ref\.No\.|Reference No\.) (.+)$',
|
'source_regex': r'^(.+?) with (Ref\. no\.|Parent Ref\.No\.|Reference No\.) (.+)$',
|
||||||
'template': '<交易类型> with Ref. no. <参考号>',
|
'template': '<交易类型> with Ref. no. <交易单号>',
|
||||||
'placeholders': ['交易类型','参考号']
|
|
||||||
},{
|
},{
|
||||||
'source_regex': r'^Buy Load Transaction for .+$',
|
'source_regex': r'^Buy Load Transaction for .+$',
|
||||||
'template': 'Buy Load Transaction for <付款人号码>',
|
'template': 'Buy Load Transaction for <付款人号码>',
|
||||||
'placeholders': ['付款人号码']
|
|
||||||
},{
|
},{
|
||||||
'source_regex': r'^Transfer from \S+ to \S+$',
|
'source_regex': r'^Transfer from \S+ to \S+$',
|
||||||
'template': 'Transfer from <付款人号码> to <收款人号码>',
|
'template': 'Transfer from <付款人号码> to <收款人号码>',
|
||||||
'placeholders': ['付款人号码', '收款人号码']
|
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -372,6 +370,11 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000
|
|||||||
final_templates[tpl].append(content)
|
final_templates[tpl].append(content)
|
||||||
# 注册元信息(占位符与其组正则)
|
# 注册元信息(占位符与其组正则)
|
||||||
if tpl not in templates_meta:
|
if tpl not in templates_meta:
|
||||||
|
# 如果没有显式提供 placeholders,从模板字符串自动提取
|
||||||
|
placeholders = ar.get('placeholders')
|
||||||
|
if not placeholders:
|
||||||
|
placeholders = re.findall(r'<([^>]+)>', ar['template'])
|
||||||
|
|
||||||
# 尝试从 source_regex 中提取顶层捕获组模式并按占位符顺序配对
|
# 尝试从 source_regex 中提取顶层捕获组模式并按占位符顺序配对
|
||||||
try:
|
try:
|
||||||
group_patterns = extract_group_patterns_from_regex(ar['source_regex'])
|
group_patterns = extract_group_patterns_from_regex(ar['source_regex'])
|
||||||
@ -379,7 +382,7 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000
|
|||||||
group_patterns = []
|
group_patterns = []
|
||||||
|
|
||||||
phs = []
|
phs = []
|
||||||
for idx, name in enumerate(ar.get('placeholders', [])):
|
for idx, name in enumerate(placeholders):
|
||||||
# 如果成功提取到对应的捕获组模式,则使用之,否则回退到通配 '(.+?)'
|
# 如果成功提取到对应的捕获组模式,则使用之,否则回退到通配 '(.+?)'
|
||||||
pat = group_patterns[idx] if idx < len(group_patterns) and group_patterns[idx] else '(.+?)'
|
pat = group_patterns[idx] if idx < len(group_patterns) and group_patterns[idx] else '(.+?)'
|
||||||
phs.append({'name': name, 'pattern': pat})
|
phs.append({'name': name, 'pattern': pat})
|
||||||
|
Loading…
x
Reference in New Issue
Block a user