update
This commit is contained in:
parent
6b5b05de17
commit
be83924799
205
sql_ml.py
205
sql_ml.py
@ -17,6 +17,18 @@ PREDEFINED_RULES = [
|
||||
# {'name': 'Sent GCash to Account', 'pattern': re.compile(r'^Sent GCash to .+? with account ending in \d+$')}
|
||||
]
|
||||
|
||||
# Abstract rules: each entry declares the matching regex, the output template,
# the placeholder names and the regex of every capture group side by side.
ABSTRACT_RULES = [
    # Example: the original regex with its placeholders and per-group
    # patterns spelled out explicitly.
    dict(
        source_regex=r'^Sent GCash to (.+?) with account ending in (\d+)$',
        template='Sent GCash to <收款人名称> with account ending in <银行4位数尾号>',
        placeholders=['收款人名称', '银行4位数尾号'],
        group_patterns=[r'(.+?)', r'(\d+)'],
    ),
    # Further abstract rules can be appended here.
]
|
||||
|
||||
# 占位符到正则表达式的映射
|
||||
PLACEHOLDER_PATTERNS = {
|
||||
'<金额>': r'([\d,.]+)',
|
||||
@ -191,6 +203,58 @@ def template_to_regex(template):
|
||||
|
||||
return escaped_template
|
||||
|
||||
|
||||
def _strip_outer_group(pattern):
    """Drop one pair of enclosing parentheses from *pattern* when it is safe.

    The parentheses are removed only when the opening one at position 0 is
    closed by the final character, so ``(a)|(b)`` is left untouched.  Special
    groups such as ``(?:...)`` are kept as they are because stripping them
    would leave a dangling ``?``.  Surrounding whitespace is always trimmed.
    """
    pattern = pattern.strip()
    if not (pattern.startswith('(') and pattern.endswith(')')):
        return pattern
    if pattern.startswith('(?'):
        return pattern

    depth = 0
    in_class = False
    last = len(pattern) - 1
    i = 0
    while i <= last:
        ch = pattern[i]
        if ch == '\\':
            # Escaped character: never structural, skip it together with
            # the backslash.
            i += 2
            continue
        if in_class:
            if ch == ']':
                in_class = False
        elif ch == '[':
            in_class = True
            # A ']' directly after '[' or '[^' is a literal member of the set.
            if pattern[i + 1:i + 2] == '^':
                i += 1
            if pattern[i + 1:i + 2] == ']':
                i += 1
        elif ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
            if depth == 0:
                # The first '(' has just been closed; strip only when that
                # happens on the very last character.
                return pattern[1:-1] if i == last else pattern
        i += 1
    return pattern


def _unique_group_name(name, taken):
    """Return a valid regex group name for *name* that is not in *taken*.

    Python 3 accepts any identifier (including non-ASCII ones) as a group
    name in ``str`` patterns, so identifier-like names are used verbatim.
    Other names have every character outside ``[0-9A-Za-z_]`` replaced by an
    underscore, get a leading underscore when they would start with a digit,
    and receive a numeric suffix if the result is already taken.
    """
    candidate = name if name.isidentifier() else re.sub(r"[^0-9A-Za-z_]", "_", name)
    if not candidate.isidentifier():
        candidate = f'_{candidate}'
    base = candidate
    suffix = 2
    while candidate in taken:
        candidate = f'{base}_{suffix}'
        suffix += 1
    return candidate


def build_regex_from_template_data(template_data):
    """Build an uncompiled regex string for matching a template.

    Args:
        template_data: dict describing a template.  Keys read here:
            ``source_regex`` (optional str), ``content`` (template text with
            ``<name>`` placeholders) and ``placeholders`` (optional list of
            ``{'name': ..., 'pattern': ...}`` dicts or of plain name strings).

    Returns:
        ``template_data['source_regex']`` unchanged when it is present and
        non-empty.  Otherwise, when no placeholder metadata is given, the
        result of ``template_to_regex(content)``.  Otherwise the escaped
        template text in which the first occurrence of every known
        placeholder is replaced by a named capture group and any further
        occurrence of the same placeholder by a non-capturing group.

    Placeholder entries without a usable string name are ignored; a missing
    or non-string pattern falls back to ``.+?``.
    """
    # An explicit original regex always wins and is returned untouched.
    source_regex = template_data.get('source_regex')
    if source_regex:
        return source_regex

    template = template_data.get('content') or ''
    placeholders_meta = template_data.get('placeholders')

    # Without placeholder metadata fall back to the legacy generator.
    if not placeholders_meta:
        return template_to_regex(template)

    # Escape the literal text; placeholders are located in their escaped form.
    escaped_template = re.escape(template)

    # escaped "<name>" token -> (group name, inner pattern)
    groups_by_token = {}
    taken_names = set()
    for ph in placeholders_meta:
        if isinstance(ph, dict):
            name = ph.get('name')
            pattern = ph.get('pattern')
        else:
            # Legacy format: only the name, pattern comes from the global map.
            name = ph
            pattern = PLACEHOLDER_PATTERNS.get(f'<{name}>', '(.+?)')

        if not isinstance(name, str) or not name:
            continue
        if not isinstance(pattern, str):
            pattern = '.+?'

        token = re.escape(f'<{name}>')
        if token in groups_by_token or token not in escaped_template:
            continue

        group_name = _unique_group_name(name, taken_names)
        taken_names.add(group_name)
        # Remove an enclosing capture group to avoid capturing twice.
        groups_by_token[token] = (group_name, _strip_outer_group(pattern))

    if not groups_by_token:
        return escaped_template

    # Substitute all placeholders in a single pass so that text inserted for
    # one placeholder (e.g. the "<name>" inside "(?P<name>") is never
    # rescanned as another placeholder.  Longest tokens are tried first.
    token_re = re.compile(
        '|'.join(
            re.escape(token)
            for token in sorted(groups_by_token, key=len, reverse=True)
        )
    )
    emitted = set()

    def substitute(match):
        group_name, inner = groups_by_token[match.group(0)]
        if group_name in emitted:
            # A group name may be defined only once per pattern.
            return f'(?:{inner})'
        emitted.add(group_name)
        return f'(?P<{group_name}>{inner})'

    return token_re.sub(substitute, escaped_template)
|
||||
|
||||
def extract_parameters(template, message):
|
||||
"""
|
||||
从消息中提取参数值
|
||||
@ -269,6 +333,8 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000
|
||||
"""
|
||||
print("--- 开始迭代式模板提取 ---")
|
||||
final_templates = {} # template -> list of original contents
|
||||
# 存储模板的元信息(占位符与其正则、原始正则等)
|
||||
templates_meta = {} # template -> { 'placeholders': [ {name, pattern}, ... ], 'source_regex': str }
|
||||
unmatched_batch = []
|
||||
batch_num = 1
|
||||
|
||||
@ -310,6 +376,30 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000
|
||||
final_templates[normalized_content].append(content)
|
||||
matched_by_rule = True
|
||||
break
|
||||
# 额外:检查抽象化规则(source_regex + template + placeholders)
|
||||
if not matched_by_rule:
|
||||
for ar in ABSTRACT_RULES:
|
||||
try:
|
||||
if re.match(ar['source_regex'], content):
|
||||
tpl = ar['template']
|
||||
if tpl not in final_templates:
|
||||
final_templates[tpl] = []
|
||||
if len(final_templates[tpl]) < 10:
|
||||
final_templates[tpl].append(content)
|
||||
# 注册元信息(占位符与其组正则)
|
||||
if tpl not in templates_meta:
|
||||
phs = []
|
||||
for name, gp in zip(ar.get('placeholders', []), ar.get('group_patterns', [])):
|
||||
phs.append({'name': name, 'pattern': gp})
|
||||
templates_meta[tpl] = {
|
||||
'placeholders': phs,
|
||||
'source_regex': ar['source_regex']
|
||||
}
|
||||
matched_by_rule = True
|
||||
break
|
||||
except re.error:
|
||||
# 如果抽象规则本身有错误,跳过
|
||||
continue
|
||||
|
||||
if matched_by_rule:
|
||||
continue
|
||||
@ -359,10 +449,33 @@ def extract_templates_iterative(input_files, output_file, rules, batch_size=1000
|
||||
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
for template, data_list in sorted(final_templates.items()):
|
||||
if max_samples_per_template == 0:
|
||||
json.dump({"content": template}, f, ensure_ascii=False)
|
||||
# 新格式:{"template":..., "regex":{name: pattern, ...}, "data":[...]}
|
||||
output_obj = {'template': template}
|
||||
|
||||
# 构建 regex 映射:placeholder name -> pattern
|
||||
regex_map = {}
|
||||
if template in templates_meta:
|
||||
for ph in templates_meta[template].get('placeholders', []):
|
||||
if isinstance(ph, dict):
|
||||
regex_map[ph.get('name')] = ph.get('pattern')
|
||||
else:
|
||||
# ph 可能是名字字符串,尝试从全局映射取 pattern
|
||||
regex_map[ph] = PLACEHOLDER_PATTERNS.get(f'<{ph}>', '(.+?)')
|
||||
# 优先保留 source_regex as special key if present
|
||||
if templates_meta[template].get('source_regex'):
|
||||
output_obj['source_regex'] = templates_meta[template]['source_regex']
|
||||
else:
|
||||
json.dump({"content": template, "data": data_list[:max_samples_per_template]}, f, ensure_ascii=False)
|
||||
# 没有元信息时,尝试根据模板内占位符名用默认映射构建
|
||||
placeholders = re.findall(r'<([^>]+)>', template)
|
||||
for name in placeholders:
|
||||
regex_map[name] = PLACEHOLDER_PATTERNS.get(f'<{name}>', '(.+?)')
|
||||
|
||||
output_obj['regex'] = regex_map
|
||||
|
||||
# 示例数据
|
||||
output_obj['data'] = data_list[:max_samples_per_template] if max_samples_per_template != 0 else []
|
||||
|
||||
json.dump(output_obj, f, ensure_ascii=False)
|
||||
f.write('\n')
|
||||
|
||||
print(f"所有模板已成功写入到 '{output_file}'。")
|
||||
@ -378,14 +491,43 @@ def extract_values_with_templates(input_files, template_file, output_file, conte
|
||||
"""
|
||||
print("--- 开始使用模板提取参数值 ---")
|
||||
|
||||
# 读取模板
|
||||
templates = []
|
||||
# 读取模板及其元信息。支持两种格式:旧的 {content, placeholders, source_regex} 或 新的 {template, regex, source_regex, data}
|
||||
templates_meta = []
|
||||
with open(template_file, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
template_data = json.loads(line)
|
||||
templates.append(template_data['content'])
|
||||
try:
|
||||
raw = json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
print(f"已加载 {len(templates)} 个模板")
|
||||
# 规范化到内部使用的 tmeta 格式:{ 'content': ..., 'placeholders': [ {name, pattern}, ... ], 'source_regex': ... }
|
||||
tmeta = {}
|
||||
if 'template' in raw:
|
||||
tmeta['content'] = raw.get('template')
|
||||
# raw may have 'regex' mapping name->pattern
|
||||
regex_map = raw.get('regex', {})
|
||||
phs = []
|
||||
for name, pat in regex_map.items():
|
||||
phs.append({'name': name, 'pattern': pat})
|
||||
# also consider raw['source_regex'] and raw['data'] if present
|
||||
if 'source_regex' in raw:
|
||||
tmeta['source_regex'] = raw.get('source_regex')
|
||||
if phs:
|
||||
tmeta['placeholders'] = phs
|
||||
elif 'content' in raw:
|
||||
# backward-compatible
|
||||
tmeta['content'] = raw.get('content')
|
||||
if 'placeholders' in raw:
|
||||
tmeta['placeholders'] = raw.get('placeholders')
|
||||
if 'source_regex' in raw:
|
||||
tmeta['source_regex'] = raw.get('source_regex')
|
||||
else:
|
||||
# skip unknown format
|
||||
continue
|
||||
|
||||
templates_meta.append(tmeta)
|
||||
|
||||
print(f"已加载 {len(templates_meta)} 个模板(含元信息/已规范化)")
|
||||
|
||||
# 从原始数据中提取值
|
||||
extracted_values = []
|
||||
@ -411,11 +553,52 @@ def extract_values_with_templates(input_files, template_file, output_file, conte
|
||||
continue
|
||||
|
||||
# 尝试匹配每个模板
|
||||
for template in templates:
|
||||
parameters = extract_parameters(template, content)
|
||||
for tmeta in templates_meta:
|
||||
# 构建用于匹配的正则
|
||||
try:
|
||||
regex_str = build_regex_from_template_data(tmeta)
|
||||
match = re.search(regex_str, content)
|
||||
except re.error:
|
||||
# 如果生成的正则有问题,退回到老方法
|
||||
match = None
|
||||
|
||||
parameters = {}
|
||||
if match:
|
||||
# 如果有命名组,优先使用命名组
|
||||
if match.groupdict():
|
||||
# 将安全组名还原为原始占位符名(如果有元信息)
|
||||
gd = match.groupdict()
|
||||
# 如果模板元信息里有占位符名映射,则把原名映射回来
|
||||
ph_map = {}
|
||||
if 'placeholders' in tmeta and isinstance(tmeta['placeholders'], list):
|
||||
for ph in tmeta['placeholders']:
|
||||
if isinstance(ph, dict):
|
||||
orig = ph.get('name')
|
||||
else:
|
||||
orig = ph
|
||||
safe = re.sub(r"[^0-9A-Za-z_]", "_", orig)
|
||||
ph_map[safe] = orig
|
||||
|
||||
for safe_name, val in gd.items():
|
||||
orig_name = ph_map.get(safe_name, safe_name)
|
||||
parameters[f'<{orig_name}>'] = val
|
||||
else:
|
||||
# 否则使用位置组,借助模板中的占位符顺序
|
||||
values = match.groups()
|
||||
placeholders = re.findall(r'<[^>]+>', tmeta.get('content', ''))
|
||||
for i, placeholder in enumerate(placeholders):
|
||||
if i < len(values):
|
||||
parameters[placeholder] = values[i]
|
||||
|
||||
else:
|
||||
# 退回老方法:通过模板字符串替换占位符生成正则
|
||||
tpl = tmeta.get('content')
|
||||
if tpl:
|
||||
parameters = extract_parameters(tpl, content)
|
||||
|
||||
if parameters:
|
||||
extracted_values.append({
|
||||
'template': template,
|
||||
'template': tmeta.get('content'),
|
||||
'message': content,
|
||||
'parameters': parameters
|
||||
})
|
||||
|
Loading…
x
Reference in New Issue
Block a user