refactor(batch-import): 优化公司名称匹配算法
- 移除未使用的 HashSet 和 Set 导入
- 添加 patternLen 字段用于存储模式长度信息
- 修改 CompanyNameMatcher 构造函数以接收 patternLen 参数
- 在构建匹配器时收集并存储每个模式的长度
- 替换原有的 matchedIds 集合匹配逻辑
- 实现基于位置和长度的最优匹配选择算法
- 优先选择更长、更具体的匹配结果
- 处理相同位置不同长度的匹配冲突情况
- 改进模糊匹配的判断逻辑和性能表现
This commit is contained in:
@@ -14,13 +14,11 @@ import org.springframework.util.CollectionUtils;
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.ArrayDeque;
|
import java.util.ArrayDeque;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Queue;
|
import java.util.Queue;
|
||||||
import java.util.Set;
|
|
||||||
import java.util.function.BiConsumer;
|
import java.util.function.BiConsumer;
|
||||||
import java.util.function.BiFunction;
|
import java.util.function.BiFunction;
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
@@ -1049,10 +1047,12 @@ public class BatchImportSupport {
|
|||||||
|
|
||||||
private final List<Node> nodes;
|
private final List<Node> nodes;
|
||||||
private final int[] patternCompanyId; // 0 means ambiguous
|
private final int[] patternCompanyId; // 0 means ambiguous
|
||||||
|
private final int[] patternLen;
|
||||||
|
|
||||||
private CompanyNameMatcher(List<Node> nodes, int[] patternCompanyId) {
|
private CompanyNameMatcher(List<Node> nodes, int[] patternCompanyId, int[] patternLen) {
|
||||||
this.nodes = nodes;
|
this.nodes = nodes;
|
||||||
this.patternCompanyId = patternCompanyId;
|
this.patternCompanyId = patternCompanyId;
|
||||||
|
this.patternLen = patternLen;
|
||||||
}
|
}
|
||||||
|
|
||||||
static CompanyNameMatcher build(List<CreditCompany> companies) {
|
static CompanyNameMatcher build(List<CreditCompany> companies) {
|
||||||
@@ -1061,14 +1061,15 @@ public class BatchImportSupport {
|
|||||||
|
|
||||||
Map<String, Integer> patternIndex = new HashMap<>();
|
Map<String, Integer> patternIndex = new HashMap<>();
|
||||||
List<Integer> companyIds = new ArrayList<>();
|
List<Integer> companyIds = new ArrayList<>();
|
||||||
|
List<Integer> patternLens = new ArrayList<>();
|
||||||
|
|
||||||
if (!CollectionUtils.isEmpty(companies)) {
|
if (!CollectionUtils.isEmpty(companies)) {
|
||||||
for (CreditCompany c : companies) {
|
for (CreditCompany c : companies) {
|
||||||
if (c == null || c.getId() == null) {
|
if (c == null || c.getId() == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
addPattern(nodes, patternIndex, companyIds, normalizeCompanyName(c.getName()), c.getId());
|
addPattern(nodes, patternIndex, companyIds, patternLens, normalizeCompanyName(c.getName()), c.getId());
|
||||||
addPattern(nodes, patternIndex, companyIds, normalizeCompanyName(c.getMatchName()), c.getId());
|
addPattern(nodes, patternIndex, companyIds, patternLens, normalizeCompanyName(c.getMatchName()), c.getId());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1076,14 +1077,19 @@ public class BatchImportSupport {
|
|||||||
for (int i = 0; i < companyIds.size(); i++) {
|
for (int i = 0; i < companyIds.size(); i++) {
|
||||||
patternCompanyId[i] = companyIds.get(i) != null ? companyIds.get(i) : 0;
|
patternCompanyId[i] = companyIds.get(i) != null ? companyIds.get(i) : 0;
|
||||||
}
|
}
|
||||||
|
int[] patternLen = new int[patternLens.size()];
|
||||||
|
for (int i = 0; i < patternLens.size(); i++) {
|
||||||
|
patternLen[i] = patternLens.get(i) != null ? patternLens.get(i) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
buildFailureLinks(nodes);
|
buildFailureLinks(nodes);
|
||||||
return new CompanyNameMatcher(nodes, patternCompanyId);
|
return new CompanyNameMatcher(nodes, patternCompanyId, patternLen);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void addPattern(List<Node> nodes,
|
private static void addPattern(List<Node> nodes,
|
||||||
Map<String, Integer> patternIndex,
|
Map<String, Integer> patternIndex,
|
||||||
List<Integer> companyIds,
|
List<Integer> companyIds,
|
||||||
|
List<Integer> patternLens,
|
||||||
String pattern,
|
String pattern,
|
||||||
Integer companyId) {
|
Integer companyId) {
|
||||||
if (pattern == null || companyId == null) {
|
if (pattern == null || companyId == null) {
|
||||||
@@ -1116,6 +1122,7 @@ public class BatchImportSupport {
|
|||||||
}
|
}
|
||||||
int idx = companyIds.size();
|
int idx = companyIds.size();
|
||||||
companyIds.add(companyId);
|
companyIds.add(companyId);
|
||||||
|
patternLens.add(pattern.length());
|
||||||
nodes.get(state).out.add(idx);
|
nodes.get(state).out.add(idx);
|
||||||
patternIndex.put(pattern, idx);
|
patternIndex.put(pattern, idx);
|
||||||
}
|
}
|
||||||
@@ -1167,7 +1174,9 @@ public class BatchImportSupport {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int state = 0;
|
int state = 0;
|
||||||
Set<Integer> matchedIds = new HashSet<>();
|
Integer bestCompanyId = null;
|
||||||
|
int bestStart = Integer.MAX_VALUE;
|
||||||
|
int bestLen = -1;
|
||||||
boolean ambiguous = false;
|
boolean ambiguous = false;
|
||||||
for (int i = 0; i < v.length(); i++) {
|
for (int i = 0; i < v.length(); i++) {
|
||||||
char ch = v.charAt(i);
|
char ch = v.charAt(i);
|
||||||
@@ -1186,19 +1195,39 @@ public class BatchImportSupport {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
int cid = patternCompanyId[idx];
|
int cid = patternCompanyId[idx];
|
||||||
|
// Pattern exists but maps to multiple companies -> ignore this hit, keep looking for a unique one.
|
||||||
if (cid == 0) {
|
if (cid == 0) {
|
||||||
ambiguous = true;
|
continue;
|
||||||
} else {
|
}
|
||||||
matchedIds.add(cid);
|
int len = (idx < patternLen.length) ? patternLen[idx] : 0;
|
||||||
if (matchedIds.size() > 1) {
|
int start = len > 0 ? (i - len + 1) : i;
|
||||||
|
if (bestCompanyId == null) {
|
||||||
|
bestCompanyId = cid;
|
||||||
|
bestStart = start;
|
||||||
|
bestLen = len;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (start < bestStart) {
|
||||||
|
bestCompanyId = cid;
|
||||||
|
bestStart = start;
|
||||||
|
bestLen = len;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (start == bestStart) {
|
||||||
|
// Prefer the longer (more specific) match at the same position.
|
||||||
|
if (len > bestLen) {
|
||||||
|
bestCompanyId = cid;
|
||||||
|
bestLen = len;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Same position + same length but different companyId -> truly ambiguous.
|
||||||
|
if (len == bestLen && !bestCompanyId.equals(cid)) {
|
||||||
ambiguous = true;
|
ambiguous = true;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (ambiguous) {
|
if (ambiguous) {
|
||||||
// Keep scanning to consume input, but we can early-exit for performance.
|
|
||||||
// For refresh use-case, ambiguous means we won't update this row.
|
|
||||||
// Still, continue is safe; break reduces CPU.
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1206,10 +1235,7 @@ public class BatchImportSupport {
|
|||||||
if (ambiguous) {
|
if (ambiguous) {
|
||||||
return new MatchResult(null, true);
|
return new MatchResult(null, true);
|
||||||
}
|
}
|
||||||
if (matchedIds.size() == 1) {
|
return new MatchResult(bestCompanyId, false);
|
||||||
return new MatchResult(matchedIds.iterator().next(), false);
|
|
||||||
}
|
|
||||||
return new MatchResult(null, false);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user