save
This commit is contained in:
parent
3d6e2d3ad7
commit
6267458d07
152
sql_ml.py
152
sql_ml.py
@ -5,6 +5,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
|||||||
from sklearn.cluster import DBSCAN
|
from sklearn.cluster import DBSCAN
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
import argparse
|
||||||
|
|
||||||
# --- 规则和辅助函数 (与之前相同) ---
|
# --- 规则和辅助函数 (与之前相同) ---
|
||||||
PREDEFINED_RULES = [
|
PREDEFINED_RULES = [
|
||||||
@ -16,6 +17,25 @@ PREDEFINED_RULES = [
|
|||||||
# {'name': 'Sent GCash to Account', 'pattern': re.compile(r'^Sent GCash to .+? with account ending in \d+$')}
|
# {'name': 'Sent GCash to Account', 'pattern': re.compile(r'^Sent GCash to .+? with account ending in \d+$')}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# 占位符到正则表达式的映射
|
||||||
|
PLACEHOLDER_PATTERNS = {
|
||||||
|
'<金额>': r'([\d,.]+)',
|
||||||
|
'<付款人名称>': r'(.+?)',
|
||||||
|
'<收款人名称>': r'(.+?)',
|
||||||
|
'<付款人号码>': r'([\d\w\+\-\(\)]+)',
|
||||||
|
'<收款人号码>': r'([\d\w\+\-\(\)]+)',
|
||||||
|
'<银行4位数尾号>': r'(\d{4})',
|
||||||
|
'<参考号>': r'(.+?)',
|
||||||
|
'<交易单号>': r'(.+?)',
|
||||||
|
'<日期时间>': r'(.+?)',
|
||||||
|
'<日期>': r'(\d{2}-\d{2}-\d{4})',
|
||||||
|
'<时间>': r'(\d{1,2}:\d{2}\s[AP]M)',
|
||||||
|
'<消息>': r'(.+?)',
|
||||||
|
'<流水号>': r'(.+?)',
|
||||||
|
'<网络或发票号>': r'(.+?)',
|
||||||
|
'<交易类型>': r'(.+?)',
|
||||||
|
}
|
||||||
|
|
||||||
def normalize_text(text):
|
def normalize_text(text):
|
||||||
# 模式 8: 从银行收款 (这条规则必须先运行)
|
# 模式 8: 从银行收款 (这条规则必须先运行)
|
||||||
text = re.sub(
|
text = re.sub(
|
||||||
@ -135,6 +155,48 @@ def normalize_text(text):
|
|||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def template_to_regex(template):
|
||||||
|
"""
|
||||||
|
将模板转换为可用于提取参数的正则表达式
|
||||||
|
"""
|
||||||
|
# 转义模板中的特殊字符,但保留占位符
|
||||||
|
escaped_template = re.escape(template)
|
||||||
|
|
||||||
|
# 将占位符映射到对应的正则表达式捕获组
|
||||||
|
for placeholder, pattern in PLACEHOLDER_PATTERNS.items():
|
||||||
|
escaped_placeholder = re.escape(placeholder)
|
||||||
|
# 替换占位符为对应的捕获组
|
||||||
|
escaped_template = escaped_template.replace(escaped_placeholder, pattern)
|
||||||
|
|
||||||
|
return escaped_template
|
||||||
|
|
||||||
|
def extract_parameters(template, message):
|
||||||
|
"""
|
||||||
|
从消息中提取参数值
|
||||||
|
"""
|
||||||
|
# 生成正则表达式
|
||||||
|
pattern = template_to_regex(template)
|
||||||
|
|
||||||
|
# 匹配消息
|
||||||
|
match = re.search(pattern, message)
|
||||||
|
|
||||||
|
if match:
|
||||||
|
# 获取所有捕获组
|
||||||
|
values = match.groups()
|
||||||
|
|
||||||
|
# 获取模板中的占位符
|
||||||
|
placeholders = re.findall(r'<[^>]+>', template)
|
||||||
|
|
||||||
|
# 创建参数字典
|
||||||
|
parameters = {}
|
||||||
|
for i, placeholder in enumerate(placeholders):
|
||||||
|
if i < len(values):
|
||||||
|
parameters[placeholder] = values[i]
|
||||||
|
|
||||||
|
return parameters
|
||||||
|
|
||||||
|
return {}
|
||||||
|
|
||||||
def run_dbscan_on_corpus(corpus, eps, min_samples, max_samples=10):
|
def run_dbscan_on_corpus(corpus, eps, min_samples, max_samples=10):
|
||||||
if not corpus: return {}
|
if not corpus: return {}
|
||||||
|
|
||||||
@ -278,18 +340,90 @@ def extract_templates_iterative(input_file, output_file, rules, batch_size=1000,
|
|||||||
print(f"错误:找不到输入文件 '{input_file}'。")
|
print(f"错误:找不到输入文件 '{input_file}'。")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def extract_values_with_templates(input_file, template_file, output_file):
|
||||||
|
"""
|
||||||
|
使用DBSCAN生成的模板从原始消息中提取参数值
|
||||||
|
"""
|
||||||
|
print("--- 开始使用模板提取参数值 ---")
|
||||||
|
|
||||||
|
# 读取模板
|
||||||
|
templates = []
|
||||||
|
with open(template_file, 'r', encoding='utf-8') as f:
|
||||||
|
for line in f:
|
||||||
|
template_data = json.loads(line)
|
||||||
|
templates.append(template_data['content'])
|
||||||
|
|
||||||
|
print(f"已加载 {len(templates)} 个模板")
|
||||||
|
|
||||||
|
# 从原始数据中提取值
|
||||||
|
extracted_values = []
|
||||||
|
|
||||||
|
with open(input_file, 'r', encoding='utf-8') as f:
|
||||||
|
total_lines = sum(1 for _ in f)
|
||||||
|
|
||||||
|
with open(input_file, 'r', encoding='utf-8') as f:
|
||||||
|
for line in tqdm(f, total=total_lines, desc="提取参数"):
|
||||||
|
try:
|
||||||
|
data = json.loads(line)
|
||||||
|
content = data.get('content', '')
|
||||||
|
|
||||||
|
if not content:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 尝试匹配每个模板
|
||||||
|
for template in templates:
|
||||||
|
parameters = extract_parameters(template, content)
|
||||||
|
if parameters:
|
||||||
|
extracted_values.append({
|
||||||
|
'template': template,
|
||||||
|
'message': content,
|
||||||
|
'parameters': parameters
|
||||||
|
})
|
||||||
|
# 找到匹配就跳出循环
|
||||||
|
break
|
||||||
|
|
||||||
|
except (json.JSONDecodeError, Exception):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 保存提取的值
|
||||||
|
with open(output_file, 'w', encoding='utf-8') as f:
|
||||||
|
for item in extracted_values:
|
||||||
|
json.dump(item, f, ensure_ascii=False)
|
||||||
|
f.write('\n')
|
||||||
|
|
||||||
|
print(f"成功从 {len(extracted_values)} 条消息中提取参数,并保存到 '{output_file}'")
|
||||||
|
|
||||||
# --- 使用示例 ---
|
# --- 使用示例 ---
|
||||||
# 假设您已经运行了上一个脚本,生成了 'content_filtered.jsonl'
|
# 假设您已经运行了上一个脚本,生成了 'content_filtered.jsonl'
|
||||||
input_jsonl_file = 'content_filtered.jsonl'
|
input_jsonl_file = 'content_filtered.jsonl'
|
||||||
output_template_file = 'templates_iterative.txt'
|
output_template_file = 'templates_iterative.txt'
|
||||||
BATCH_PROCESSING_SIZE = 10000 # 可以根据你的内存和数据量调整
|
BATCH_PROCESSING_SIZE = 10000 # 可以根据你的内存和数据量调整
|
||||||
|
|
||||||
extract_templates_iterative(
|
if __name__ == "__main__":
|
||||||
input_file=input_jsonl_file,
|
parser = argparse.ArgumentParser(description='Extract templates from GCash transaction data.')
|
||||||
output_file=output_template_file,
|
parser.add_argument('--input_file', type=str, default=input_jsonl_file, help='Input JSONL file path')
|
||||||
rules=PREDEFINED_RULES,
|
parser.add_argument('--output_file', type=str, default=output_template_file, help='Output template file path')
|
||||||
batch_size=BATCH_PROCESSING_SIZE,
|
parser.add_argument('--batch_size', type=int, default=BATCH_PROCESSING_SIZE, help='Batch processing size (data volume)')
|
||||||
eps=0.4,
|
parser.add_argument('--eps', type=float, default=0.4, help='DBSCAN eps parameter')
|
||||||
min_samples=5, # 稍微提高min_samples可以得到更可靠的模板
|
parser.add_argument('--min_samples', type=int, default=5, help='DBSCAN min_samples parameter')
|
||||||
max_samples_per_template=10 # 设置为正数以导出样本数据,0表示不导出
|
parser.add_argument('--extract_values', action='store_true', help='Extract values using generated templates')
|
||||||
)
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.extract_values:
|
||||||
|
# 执行参数提取
|
||||||
|
extract_values_with_templates(
|
||||||
|
input_file=args.input_file,
|
||||||
|
template_file=args.output_file,
|
||||||
|
output_file='extracted_parameters.jsonl'
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# 执行模板提取
|
||||||
|
extract_templates_iterative(
|
||||||
|
input_file=args.input_file,
|
||||||
|
output_file=args.output_file,
|
||||||
|
rules=PREDEFINED_RULES,
|
||||||
|
batch_size=args.batch_size,
|
||||||
|
eps=args.eps,
|
||||||
|
min_samples=args.min_samples
|
||||||
|
)
|
Loading…
x
Reference in New Issue
Block a user