refactor(batch-import): 优化公司名称匹配算法

- 移除未使用的 HashSet 和 Set 导入
- 添加 patternLen 字段用于存储模式长度信息
- 修改 CompanyNameMatcher 构造函数以接收 patternLen 参数
- 在构建匹配器时收集并存储每个模式的长度
- 替换原有的 matchedIds 集合匹配逻辑
- 实现基于位置和长度的最优匹配选择算法
- 优先选择起始位置更靠前的匹配;起始位置相同时优先选择更长、更具体的匹配结果
- 处理相同位置不同长度的匹配冲突情况
- 改进模糊匹配的判断逻辑和性能表现
This commit is contained in:
2026-02-06 18:27:13 +08:00
parent e7133f65c9
commit 6b401b8286

View File

@@ -14,13 +14,11 @@ import org.springframework.util.CollectionUtils;
import java.util.ArrayList;
import java.util.ArrayDeque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BiFunction;
import java.util.function.Consumer;
@@ -1049,10 +1047,12 @@ public class BatchImportSupport {
private final List<Node> nodes;
private final int[] patternCompanyId; // 0 means ambiguous
private final int[] patternLen;
private CompanyNameMatcher(List<Node> nodes, int[] patternCompanyId) {
private CompanyNameMatcher(List<Node> nodes, int[] patternCompanyId, int[] patternLen) {
this.nodes = nodes;
this.patternCompanyId = patternCompanyId;
this.patternLen = patternLen;
}
static CompanyNameMatcher build(List<CreditCompany> companies) {
@@ -1061,14 +1061,15 @@ public class BatchImportSupport {
Map<String, Integer> patternIndex = new HashMap<>();
List<Integer> companyIds = new ArrayList<>();
List<Integer> patternLens = new ArrayList<>();
if (!CollectionUtils.isEmpty(companies)) {
for (CreditCompany c : companies) {
if (c == null || c.getId() == null) {
continue;
}
addPattern(nodes, patternIndex, companyIds, normalizeCompanyName(c.getName()), c.getId());
addPattern(nodes, patternIndex, companyIds, normalizeCompanyName(c.getMatchName()), c.getId());
addPattern(nodes, patternIndex, companyIds, patternLens, normalizeCompanyName(c.getName()), c.getId());
addPattern(nodes, patternIndex, companyIds, patternLens, normalizeCompanyName(c.getMatchName()), c.getId());
}
}
@@ -1076,14 +1077,19 @@ public class BatchImportSupport {
for (int i = 0; i < companyIds.size(); i++) {
patternCompanyId[i] = companyIds.get(i) != null ? companyIds.get(i) : 0;
}
int[] patternLen = new int[patternLens.size()];
for (int i = 0; i < patternLens.size(); i++) {
patternLen[i] = patternLens.get(i) != null ? patternLens.get(i) : 0;
}
buildFailureLinks(nodes);
return new CompanyNameMatcher(nodes, patternCompanyId);
return new CompanyNameMatcher(nodes, patternCompanyId, patternLen);
}
private static void addPattern(List<Node> nodes,
Map<String, Integer> patternIndex,
List<Integer> companyIds,
List<Integer> patternLens,
String pattern,
Integer companyId) {
if (pattern == null || companyId == null) {
@@ -1116,6 +1122,7 @@ public class BatchImportSupport {
}
int idx = companyIds.size();
companyIds.add(companyId);
patternLens.add(pattern.length());
nodes.get(state).out.add(idx);
patternIndex.put(pattern, idx);
}
@@ -1167,7 +1174,9 @@ public class BatchImportSupport {
}
int state = 0;
Set<Integer> matchedIds = new HashSet<>();
Integer bestCompanyId = null;
int bestStart = Integer.MAX_VALUE;
int bestLen = -1;
boolean ambiguous = false;
for (int i = 0; i < v.length(); i++) {
char ch = v.charAt(i);
@@ -1186,19 +1195,39 @@ public class BatchImportSupport {
continue;
}
int cid = patternCompanyId[idx];
// Pattern exists but maps to multiple companies (id 0) -> flag the input as ambiguous; the scan stops below and no company is returned.
if (cid == 0) {
ambiguous = true;
} else {
matchedIds.add(cid);
if (matchedIds.size() > 1) {
continue;
}
int len = (idx < patternLen.length) ? patternLen[idx] : 0;
int start = len > 0 ? (i - len + 1) : i;
if (bestCompanyId == null) {
bestCompanyId = cid;
bestStart = start;
bestLen = len;
continue;
}
if (start < bestStart) {
bestCompanyId = cid;
bestStart = start;
bestLen = len;
continue;
}
if (start == bestStart) {
// Prefer the longer (more specific) match at the same position.
if (len > bestLen) {
bestCompanyId = cid;
bestLen = len;
continue;
}
// Same position + same length but different companyId -> truly ambiguous.
if (len == bestLen && !bestCompanyId.equals(cid)) {
ambiguous = true;
break;
}
}
}
if (ambiguous) {
// Once ambiguous, the result is reported as ambiguous no matter what else matches,
// so stop scanning here. Continuing would also be correct; breaking just saves CPU.
// For the refresh use-case, ambiguous means we won't update this row.
break;
}
}
@@ -1206,10 +1235,7 @@ public class BatchImportSupport {
if (ambiguous) {
return new MatchResult(null, true);
}
if (matchedIds.size() == 1) {
return new MatchResult(matchedIds.iterator().next(), false);
}
return new MatchResult(null, false);
return new MatchResult(bestCompanyId, false);
}
}
}