From acaf55c728c2a124e59f36abf4eae5daf2eb9636 Mon Sep 17 00:00:00 2001 From: dev07 Date: Tue, 9 Sep 2025 17:35:41 +0800 Subject: [PATCH] =?UTF-8?q?=E8=B2=8C=E4=BC=BC=E5=8F=AF=E4=BB=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sql_ml.py | 353 +++++++++++++++++++++++++++--------------------------- 1 file changed, 174 insertions(+), 179 deletions(-) diff --git a/sql_ml.py b/sql_ml.py index 8293b90..b37ce18 100644 --- a/sql_ml.py +++ b/sql_ml.py @@ -7,46 +7,124 @@ from sklearn.metrics.pairwise import cosine_similarity from tqdm import tqdm import argparse -# --- 规则和辅助函数 (与之前相同) --- +# --- 规则和辅助函数 --- PREDEFINED_RULES = [ - # {'name': 'Transfer', 'pattern': re.compile(r'^Transfer from \d+ to \d+$')}, - # {'name': 'International Remittance', 'pattern': re.compile(r'^International Remittance$')}, - # {'name': 'Bill Payment', 'pattern': re.compile(r'^Bill payment successful for amount \d+$')}, - # {'name': 'New Message', 'pattern': re.compile(r'^You have a new message from \d+\.$')}, - # # 新增规则:匹配类似 "Sent GCash to GoTyme Bank with account ending in 6784" - # {'name': 'Sent GCash to Account', 'pattern': re.compile(r'^Sent GCash to .+? with account ending in \d+$')} + ] # 抽象化规则:将正则、模板和占位符/每组的正则一并声明 ABSTRACT_RULES = [ { - # 示例:把原始正则和占位符、每组的正则显式化 + # 示例:把原始正则和占位符显式化 'source_regex': r'^Sent GCash to (.+?) with account ending in (\d+)$', 'template': 'Sent GCash to <收款人名称> with account ending in <银行4位数尾号>', - 'placeholders': ['收款人名称', '银行4位数尾号'], - 'group_patterns': ['(.+?)', '(\\d+)'] + 'placeholders': ['收款人名称', '银行4位数尾号'] }, - # 你可以在这里继续添加更多抽象规则 + { + # Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <网络或发票号> + 'source_regex': r'(?i)^Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$', + 'template': 'Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <网络或发票号>', + 'placeholders': ['付款人名称', '银行4位数尾号', '网络或发票号'] + }, + { + # Payment to <商户名>, Merchant Transaction Number: <交易单号> + 'source_regex': r'^Payment to (.+?), Merchant Transaction Number: (.+)$', + 'template': 'Payment to <收款人名称>, Merchant Transaction Number: <交易单号>', + 'placeholders': ['收款人名称', '交易单号'] + }, + # 以下条目为自动迁移自 normalize_text 的简单规则 + { + 'source_regex': r'^You have sent PHP [\d,]+\.\d{2} to an unverified account [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \.\..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\. Go to GCash Help Center to know how to secure your transactions\.$', + 'template': 'You have sent PHP <金额> to an unverified account <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <流水号>. Go to GCash Help Center to know how to secure your transactions.', + 'placeholders': ['金额', '收款人号码', '日期', '时间', '消息', '金额', '流水号'] + },{ + 'source_regex': r'^You have sent PHP [\d,]+\.\d{2} to .+? [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: .*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$', + 'template': 'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <参考号>.', + 'placeholders': ['金额', '收款人名称', '收款人号码', '日期', '时间', '消息', '金额', '参考号'] + },{ + 'source_regex': r'^You have received\s+(?:PHP\s+)?[\d,.]+\s+of GCash from\s+.+?\. Your new balance is\s+(?:PHP\s+)?[\d,.]+\s+\d{1,2}-\d{1,2}-\d{2,4}\s+\d{1,2}:\d{1,2}(?::\d{1,2})?\s+[AP]M\. Ref\. No\.\s+.+?\. Use now to buy load, purchase items, send money, pay bills, and a lot more!$', + 'template': 'You have received <金额> of GCash from <付款人名称>. Your new balance is <金额>. <日期时间>. Ref. No. <流水号>. Use now to buy load, purchase items, send money, pay bills, and a lot more!', + 'placeholders': ['金额','付款人名称','金额','日期时间','流水号'] + },{ + 'source_regex': r'^You have received PHP [\d,.]+\s+of GCash from .+? w/ MSG: .*\. (?:Your new balance is PHP [\d,.]*\.\s)?Ref\. No\. \d+\.(?: To access your funds,.*)?$', + 'template': 'You have received PHP <金额> of GCash from <付款人名称> w/ MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <参考号>.', + 'placeholders': ['金额','付款人名称','消息','金额','参考号'] + },{ + 'source_regex': r'^You have paid P[\d,.]+\s+via GCash to .+? on \d{1,2}-\d{1,2}-\d{2,4}\s\d{1,2}:\d{1,2}:\d{1,2}\s+[AP]M\. Ref\. No\.\s+\d+\. QRPH Invoice No\.\s+\d+\.$', + 'template': 'You have paid P<金额> via GCash to <收款人名称> on <日期时间>. Ref. No. <参考号>. QRPH Invoice No. <参考号>.', + 'placeholders': ['金额','收款人名称','日期时间','参考号','参考号'] + },{ + 'source_regex': r'(?i)^Received GCash from [^<]+$', + 'template': 'Received GCash from <付款人名称>', + 'placeholders': ['付款人名称'] + },{ + 'source_regex': r'^Payment to ([^,]+)$', + 'template': 'Payment to <收款人名称>', + 'placeholders': ['收款人名称'] + },{ + 'source_regex': r'^(.+?) with (Ref\. no\.|Parent Ref\.No\.|Reference No\.) (.+)$', + 'template': '<交易类型> with Ref. no. <参考号>', + 'placeholders': ['交易类型','参考号'] + },{ + 'source_regex': r'^Buy Load Transaction for .+$', + 'template': 'Buy Load Transaction for <付款人号码>', + 'placeholders': ['付款人号码'] + },{ + 'source_regex': r'^Transfer from \S+ to \S+$', + 'template': 'Transfer from <付款人号码> to <收款人号码>', + 'placeholders': ['付款人号码', '收款人号码'] + } ] -# 占位符到正则表达式的映射 -PLACEHOLDER_PATTERNS = { - '<金额>': r'([\d,.]+)', - '<付款人名称>': r'(.+?)', - '<收款人名称>': r'(.+?)', - '<付款人号码>': r'([\d\w\+\-\(\)]+)', - '<收款人号码>': r'([\d\w\+\-\(\)]+)', - '<银行4位数尾号>': r'(\d{4})', - '<参考号>': r'(.+?)', - '<交易单号>': r'(.+?)', - '<日期时间>': r'(.+?)', - '<日期>': r'(\d{2}-\d{2}-\d{4})', - '<时间>': r'(\d{1,2}:\d{2}\s[AP]M)', - '<消息>': r'(.+?)', - '<流水号>': r'(.+?)', - '<网络或发票号>': r'(.+?)', - '<交易类型>': r'(.+?)', -} +# 注:不再使用全局 PLACEHOLDER_PATTERNS 作为主要来源。 +# 如果需要回退模式,会使用通用 '(.+?)' 作为默认。 + +def extract_group_patterns_from_regex(regex_str: str): + """ + 从一个正则表达式字符串中提取按出现顺序的顶层捕获组的内部模式(不含最外层括号)。 + + 说明与限制: + - 只提取按文本顺序出现的捕获组 (包括命名组 (?P...)),跳过非捕获组 (?:...), lookaround 等。 + - 对非常复杂或不规则的正则(条件组、嵌套的命名/非命名混合、内联 flags 等)不能保证 100% 正确,建议对特殊规则手动验证/修正。 + 返回值:字符串列表,例如 ['\\d{4}', '.+?'](不包含外层括号)。 + """ + s = regex_str + n = len(s) + i = 0 + stack = [] # each item: (start_index, is_capturing) + results = [] + while i < n: + ch = s[i] + if ch == '\\': + # skip escaped char + i += 2 + continue + if ch == '(': + # lookahead to decide capturing vs non-capturing + is_capturing = True + if i + 1 < n and s[i+1] == '?': + # (?P... ) is a capturing named group + if i + 2 < n and s[i+2] == 'P': + is_capturing = True + else: + # other (?...) forms are non-capturing or lookaround + is_capturing = False + stack.append((i, is_capturing)) + i += 1 + continue + if ch == ')': + if not stack: + i += 1 + continue + start, is_capturing = stack.pop() + if is_capturing: + inner = s[start+1:i] + # strip surrounding whitespace + results.append(inner) + i += 1 + continue + i += 1 + return results def get_placeholder(action): """ @@ -62,131 +140,23 @@ def get_placeholder(action): return '付款人号码' # 默认值 def normalize_text(text): - # 模式 8: 从银行收款 (这条规则必须先运行) - text = re.sub( - r'(?i)Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$', - r'Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <网络或发票号>', - text - ) + # 先统一应用 ABSTRACT_RULES 中的简单替换规则(已迁移) + for ar in ABSTRACT_RULES: + try: + if ar.get('source_regex') and ar.get('template'): + text = re.sub(ar['source_regex'], ar['template'], text) + except re.error: + # 跳过不合法的正则 + continue - # 模式 13: 向未验证账户发送凭证 - # 结构: You have sent <货币> <金额> to an unverified account <手机号> on <日期> <时间> with MSG: <消息>. Your new balance is <货币> <金额>. Ref. No. <流水号>. Go to... - text = re.sub( - r'^You have sent PHP [\d,]+\.\d{2} to an unverified account [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\. Go to GCash Help Center to know how to secure your transactions\.$', - r'You have sent PHP <金额> to an unverified account <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <流水号>. Go to GCash Help Center to know how to secure your transactions.', - text - ) - - # 模式 12: 详细发送凭证 (更新后可处理多段姓名) - # 结构: You have sent <货币> <金额> to <收款人> <手机号> on <日期> <时间> with MSG: <消息>. Your new balance is <货币> <金额>. Ref. No. <流水号>. - text = re.sub( - r'^You have sent PHP [\d,]+\.\d{2} to (?:[A-Z\*]+\s)+[A-Z\*]\.\s[\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$', - r'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <流水号>.', - text - ) - - # 模式 11 (机构): 详细收款凭证 - # 结构: You have received ... of GCash from <来源>. Your new balance is ... <日期时间>. Ref. No. <流水号>. Use now to buy load... - text = re.sub( - r'^You have received\s+(?:PHP\s+)?[\d,.]+\s+of GCash from\s+.+?\. Your new balance is\s+(?:PHP\s+)?[\d,.]+\s+\d{1,2}-\d{1,2}-\d{2,4}\s+\d{1,2}:\d{1,2}(?::\d{1,2})?\s+[AP]M\. Ref\. No\.\s+.+?\. Use now to buy load, purchase items, send money, pay bills, and a lot more!$', - r'You have received <金额> of GCash from <付款人名称>. Your new balance is <金额>. <日期时间>. Ref. No. <流水号>. Use now to buy load, purchase items, send money, pay bills, and a lot more!', - text - ) - - # 模式 11 (个人): 详细收款凭证 (最终修正版,兼容多种手机号/余额/结尾格式) - text = re.sub( - r'^You have received PHP [\d,.]+\s+of GCash from .+? w/ MSG: .*\. (?:Your new balance is PHP [\d,.]*\.\s)?Ref\. No\. \d+\.(?: To access your funds,.*)?$', - r'You have received PHP <金额> of GCash from <付款人名称> w/ MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <参考号>.', - text - ) - # 模式 12: 详细发送凭证 (最终修正版,兼容所有已知姓名格式) - text = re.sub( - r'^You have sent PHP [\d,]+\.\d{2} to .+? [\d\w\+\-\(\)]+ on \d{2}-\d{2}-\d{4}\s\d{1,2}:\d{2}\s[AP]M with MSG: \..*? Your new balance is PHP [\d,]+\.\d{2}\. Ref\. No\. \d+\.$', - r'You have sent PHP <金额> to <收款人名称> <收款人号码> on <日期> <时间> with MSG: <消息>. Your new balance is PHP <金额>. Ref. No. <参考号>.', - text - ) - - # 模式 10 (来自用户最初的模板列表,这里将其具体化) - # 结构: You have paid <金额> via GCash to <接收方> on <日期时间>. Ref. No. <参考号>. QRPH Invoice No. <参考号>. - text = re.sub( - r'^You have paid P[\d,.]+\s+via GCash to .+? on \d{1,2}-\d{1,2}-\d{2,4}\s\d{1,2}:\d{1,2}:\d{1,2}\s+[AP]M\. Ref\. No\.\s+\d+\. QRPH Invoice No\.\s+\d+\.$', - r'You have paid P<金额> via GCash to <收款人名称> on <日期时间>. Ref. No. <参考号>. QRPH Invoice No. <参考号>.', - text - ) - - # 模式 9 (来自用户最初的模板列表,这里将其具体化) - # 结构: Sent GCash to <机构名> with account ending in <尾号> - text = re.sub( - r'Sent GCash to (.+?) with account ending in (\d+)$', - r'Sent GCash to <收款人名称> with account ending in <银行4位数尾号>', - text - ) - - - - - # 新增规则:模式 7: 从一般来源收款 (这条规则紧随其后) - # 它只会处理没有被上面那条规则匹配到的 "Received GCash from..." - text = re.sub( - r'(?i)^Received GCash from [^<]+$', - r'Received GCash from <付款人名称>', - text - ) - # 模式 6: 带商户交易单号的支付 - # 结构: Payment to <商户名>, Merchant Transaction Number: <交易单号> - text = re.sub( - r'Payment to (.+?), Merchant Transaction Number: (.+)$', - r'Payment to <收款人名称>, Merchant Transaction Number: <交易单号>', - text - ) - - # 模式 5 (来自用户最初的模板列表,这里将其具体化) - # 结构: Payment to <商户名> - text = re.sub( - r'^Payment to ([^,]+)$', - r'Payment to <收款人名称>', - text - ) - - text = re.sub( - r'^(.+?) with (Ref\. no\.|Parent Ref\.No\.|Reference No\.) (.+)$', - r'<交易类型> with Ref. no. <参考号>', - text - ) - - text = re.sub(r'Sent GCash to <收款人名称> with account ending in (\d+)$', r'Sent GCash to <收款人名称> with account ending in <银行4位数尾号>', text) - text = re.sub(r'^Transfer from \S+ to \S+$', r'Transfer from <付款人号码> to <收款人号码>', text) - # 模式 8: 从银行收款 - # 结构: Received GCash from <机构名> with account ending in <尾号> via <网络> or with invno:<...> - text = re.sub( - r'(?i)Received GCash from (.+?) with account ending in (\d+) (via .+|and invno:.+)$', - r'Received GCash from <付款人名称> with account ending in <银行4位数尾号> via <流水号>', - text - ) - - # 新增规则:Buy Load Transaction - text = re.sub( - r'^Buy Load Transaction for .+$', - r'Buy Load Transaction for <付款人号码>', - text - ) - - # 新增规则:Refund - text = re.sub( - r'^Refund from .+$', - r'Refund from <收款人名称>', - text - ) - - # 新增规则:统一处理所有JSON格式消息 - # 匹配各种action类型:Received money from, Sent money to, Received settlement from, Reversed settlement from等 + # 保留无法迁移的复杂规则:统一处理所有JSON格式消息(使用lambda) text = re.sub( r'\\\"(Received money from|Sent money to|Received settlement from|Reversed settlement from|Refunded money via|Sent money via)\\\",\\\"target\\\":\\\"(.+?)\\\"', lambda m: f'\\\"{m.group(1)}\\\",\\\"target\\\":\\\"<{get_placeholder(m.group(1))}>\\\"', text ) - return text + return text def template_to_regex(template): """ @@ -195,20 +165,21 @@ def template_to_regex(template): # 转义模板中的特殊字符,但保留占位符 escaped_template = re.escape(template) - # 将占位符映射到对应的正则表达式捕获组 - for placeholder, pattern in PLACEHOLDER_PATTERNS.items(): - escaped_placeholder = re.escape(placeholder) - # 替换占位符为对应的捕获组 - escaped_template = escaped_template.replace(escaped_placeholder, pattern) + # 将占位符替换为通用捕获组(因为不再依赖全局占位符映射) + # 如果需要更精确的子模式,请在 ABSTRACT_RULES 的 source_regex 中提供捕获组,后续会从中提取。 + default_pat = r'(.+?)' + placeholders = re.findall(r'<([^>]+)>', template) + for name in placeholders: + escaped_placeholder = re.escape(f'<{name}>') + escaped_template = escaped_template.replace(escaped_placeholder, default_pat, 1) return escaped_template def build_regex_from_template_data(template_data): """ - 根据模板的元信息(placeholders / group_patterns / source_regex)构建一个用于匹配的正则表达式字符串。 - 优先使用 template_data['source_regex'](如果存在),否则从 template_data['content'] + template_data['placeholders'] 生成带命名分组的正则。 - 返回未编译的正则字符串。 + 根据模板的元信息构建一个用于匹配的正则表达式字符串。 + 优先使用 template_data['source_regex'],否则从 template_data['content'] + template_data['placeholders'] 生成带命名分组的正则。 """ # 如果有原始正则,直接使用(保持原样) if 'source_regex' in template_data and template_data['source_regex']: @@ -230,28 +201,41 @@ def build_regex_from_template_data(template_data): return pat[1:-1] return pat + # placeholders_meta 可能是 [{'name':..., 'pattern':...}, ...] 或 ['name1','name2'] + # 使用一个计数器来处理重复的占位符名称 + placeholder_count = {} # placeholders_meta 可能是 [{'name':..., 'pattern':...}, ...] 或 ['name1','name2'] for ph in placeholders_meta: if isinstance(ph, dict): name = ph.get('name') - pattern = ph.get('pattern', '.+?') + pattern = ph.get('pattern') if ph.get('pattern') else '(.+?)' else: # 旧格式:仅名字 name = ph - # 尝试从全局映射中找到默认pattern - pattern = PLACEHOLDER_PATTERNS.get(f'<{name}>', '(.+?)') + # 使用默认通配子模式 + pattern = '(.+?)' + + # 处理重复的占位符名称 + count = placeholder_count.get(name, 0) + placeholder_count[name] = count + 1 + + # 如果有重复,使用 name_1, name_2 等 + safe_name = re.sub(r"[^0-9A-Za-z_]", "_", name) + # 为了避免不同名称产生相同的safe_name,添加更多区分信息 + if len(safe_name) <= 5: # 如果转换后很短 + # 使用名称的字符信息来区分 + char_sum = sum(ord(c) for c in name) + safe_name = f"{safe_name}_{char_sum % 100}" + if count > 0: + safe_name = f"{safe_name}_{count}" # 清理 pattern 的外层括号以避免双重捕获 inner = strip_outer_parens(pattern) - # 命名捕获组 (保持组名为中文名,但 Python 的 (?P) 要求组名为字母数字和下划线) - # 将占位符名中的非字母数字替换为下划线以构造合法的组名 - safe_name = re.sub(r"[^0-9A-Za-z_]", "_", name) - named_group = f"(?P<{safe_name}>{inner})" escaped_placeholder = re.escape(f'<{name}>') - escaped_template = escaped_template.replace(escaped_placeholder, named_group) + escaped_template = escaped_template.replace(escaped_placeholder, named_group, 1) # 只替换第一个匹配 return escaped_template @@ -388,13 +372,14 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000 final_templates[tpl].append(content) # 注册元信息(占位符与其组正则) if tpl not in templates_meta: - phs = [] - for name, gp in zip(ar.get('placeholders', []), ar.get('group_patterns', [])): - phs.append({'name': name, 'pattern': gp}) - templates_meta[tpl] = { - 'placeholders': phs, - 'source_regex': ar['source_regex'] - } + phs = [] + for name in ar.get('placeholders', []): + # 优先把 source_regex 作为权威,不在此处重复具体子模式(用 None 标记) + phs.append({'name': name, 'pattern': None}) + templates_meta[tpl] = { + 'placeholders': phs, + 'source_regex': ar['source_regex'] + } matched_by_rule = True break except re.error: @@ -459,8 +444,8 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000 if isinstance(ph, dict): regex_map[ph.get('name')] = ph.get('pattern') else: - # ph 可能是名字字符串,尝试从全局映射取 pattern - regex_map[ph] = PLACEHOLDER_PATTERNS.get(f'<{ph}>', '(.+?)') + # ph 可能是名字字符串,使用通用后备模式 + regex_map[ph] = '(.+?)' # 优先保留 source_regex as special key if present if templates_meta[template].get('source_regex'): output_obj['source_regex'] = templates_meta[template]['source_regex'] @@ -468,7 +453,7 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000 # 没有元信息时,尝试根据模板内占位符名用默认映射构建 placeholders = re.findall(r'<([^>]+)>', template) for name in placeholders: - regex_map[name] = PLACEHOLDER_PATTERNS.get(f'<{name}>', '(.+?)') + regex_map[name] = '(.+?)' output_obj['regex'] = regex_map @@ -583,12 +568,22 @@ def extract_values_with_templates(input_files, template_file, output_file, conte orig_name = ph_map.get(safe_name, safe_name) parameters[f'<{orig_name}>'] = val else: - # 否则使用位置组,借助模板中的占位符顺序 - values = match.groups() - placeholders = re.findall(r'<[^>]+>', tmeta.get('content', '')) - for i, placeholder in enumerate(placeholders): - if i < len(values): - parameters[placeholder] = values[i] + # 否则使用位置组,优先使用模板元信息中的占位符顺序(来自 templates_meta) + values = match.groups() + placeholders_meta = tmeta.get('placeholders') + if placeholders_meta and isinstance(placeholders_meta, list): + # placeholders_meta 可能是 [{'name':..., 'pattern':...}, ...] 或 ['name1','name2'] + for i, ph in enumerate(placeholders_meta): + if i >= len(values): + break + name = ph.get('name') if isinstance(ph, dict) else ph + parameters[f'<{name}>'] = values[i] + else: + # 退回到从模板字符串解析占位符(向后兼容旧格式) + placeholders = re.findall(r'<([^>]+)>', tmeta.get('content', '')) + for i, name in enumerate(placeholders): + if i < len(values): + parameters[f'<{name}>'] = values[i] else: # 退回老方法:通过模板字符串替换占位符生成正则