新增excel转markdown工具类

This commit is contained in:
2026-03-05 10:41:11 +08:00
parent 0d91d1fcd7
commit fa1430030b

View File

@@ -0,0 +1,258 @@
package com.gxwebsoft.ai.utils;
import org.apache.poi.ss.usermodel.*;
import org.springframework.web.multipart.MultipartFile;
import java.io.*;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Utility for converting Excel workbooks into Markdown tables (Apache POI 4+ / 5+).
 *
 * <p>Accepts a {@link MultipartFile} or a plain {@link InputStream}; the conversion methods build
 * the Markdown in memory and do not write anything to disk.
 */
public class ExcelToMarkdownUtils {

    /** Base name used in result keys when no usable file name is supplied. */
    private static final String UNKNOWN_FILENAME = "unknown";

    private ExcelToMarkdownUtils() {}

    /**
     * Converts every sheet of an uploaded Excel file into a Markdown table.
     *
     * @param file the uploaded Excel file
     * @return an insertion-ordered map; key = "file name without extension-sheet name",
     *         value = the Markdown for that sheet (empty string for a sheet without content)
     * @throws IOException if the upload cannot be read or parsed as a workbook
     */
    public static Map<String, String> convertToMarkdown(MultipartFile file) throws IOException {
        // The stream is opened here, so it is closed here; closing the workbook is not relied on
        // to release it.
        try (InputStream in = file.getInputStream()) {
            return convertToMarkdown(in, file.getOriginalFilename());
        }
    }

    /**
     * Converts every sheet of the workbook read from {@code inputStream} into a Markdown table.
     * The stream is not closed by this method.
     *
     * @param inputStream      stream positioned at the start of an Excel workbook
     * @param originalFilename file name used to build the result keys; its extension is removed,
     *                         and {@code null} or blank is replaced by {@code "unknown"}
     * @return an insertion-ordered map; key = "file name without extension-sheet name",
     *         value = the Markdown for that sheet (empty string for a sheet without content)
     * @throws IOException if the stream cannot be read or parsed as a workbook
     */
    public static Map<String, String> convertToMarkdown(InputStream inputStream, String originalFilename) throws IOException {
        String baseName = baseNameOf(originalFilename);
        Map<String, String> result = new LinkedHashMap<>();
        try (Workbook workbook = WorkbookFactory.create(inputStream)) {
            DataFormatter formatter = new DataFormatter();
            int sheetCount = workbook.getNumberOfSheets();
            for (int i = 0; i < sheetCount; i++) {
                Sheet sheet = workbook.getSheetAt(i);
                result.put(baseName + "-" + sheet.getSheetName(), sheetToMarkdown(sheet, formatter));
            }
        }
        return result;
    }

    /** Returns the file name without its extension, or {@code "unknown"} when it is null/blank. */
    private static String baseNameOf(String originalFilename) {
        if (originalFilename == null || originalFilename.trim().isEmpty()) {
            return UNKNOWN_FILENAME;
        }
        int dotIndex = originalFilename.lastIndexOf('.');
        // dotIndex == 0 (e.g. ".xlsx") is kept as-is so the base name never becomes empty.
        return dotIndex > 0 ? originalFilename.substring(0, dotIndex) : originalFilename;
    }

    /**
     * Converts a single sheet into a Markdown table.
     *
     * <p>Rows whose cells are all blank are skipped, trailing blank columns are removed, and
     * every remaining row is padded to the same width. The first remaining row is rendered as
     * the header and is followed by the {@code | --- |} separator row.
     *
     * @return the Markdown table, or an empty string when the sheet has no non-blank cell
     */
    private static String sheetToMarkdown(Sheet sheet, DataFormatter formatter) {
        List<List<String>> rows = new ArrayList<>();
        int columnCount = 0;
        for (Row row : sheet) {
            List<String> cells = readRow(row, formatter);
            if (cells.isEmpty()) {
                continue; // entirely blank row
            }
            rows.add(cells);
            // Each row is already trimmed on the right, so the widest row defines the table.
            columnCount = Math.max(columnCount, cells.size());
        }
        if (rows.isEmpty()) {
            return "";
        }

        StringBuilder md = new StringBuilder();
        appendRow(md, rows.get(0), columnCount);
        appendSeparator(md, columnCount);
        for (List<String> cells : rows.subList(1, rows.size())) {
            appendRow(md, cells, columnCount);
        }
        return md.toString();
    }

    /**
     * Reads the display text of every cell in {@code row}, with trailing blank cells removed.
     *
     * @return the cell texts; empty when the row holds no non-blank cell
     */
    private static List<String> readRow(Row row, DataFormatter formatter) {
        List<String> cells = new ArrayList<>();
        int lastCellNum = row.getLastCellNum(); // -1 when the row contains no cells
        for (int i = 0; i < lastCellNum; i++) {
            // Read-only policy: missing cells come back as null instead of being created.
            Cell cell = row.getCell(i, Row.MissingCellPolicy.RETURN_NULL_AND_BLANK);
            cells.add(toMarkdownCell(getCellDisplayValue(cell, formatter)));
        }
        int end = cells.size();
        while (end > 0 && cells.get(end - 1).isEmpty()) {
            end--;
        }
        return new ArrayList<>(cells.subList(0, end));
    }

    /** Makes a cell text safe for a single Markdown table cell. */
    private static String toMarkdownCell(String value) {
        return value
                .replace("\n", " ")
                .replace("\r", "")
                // An unescaped pipe would be read as a column boundary.
                .replace("|", "\\|")
                .trim();
    }

    /** Appends one table row, padding with empty cells up to {@code columnCount}. */
    private static void appendRow(StringBuilder md, List<String> cells, int columnCount) {
        md.append('|');
        for (int i = 0; i < columnCount; i++) {
            String text = i < cells.size() ? cells.get(i) : "";
            md.append(' ').append(text).append(" |");
        }
        md.append('\n');
    }

    /** Appends the header separator row ({@code | --- | --- |}). */
    private static void appendSeparator(StringBuilder md, int columnCount) {
        md.append('|');
        for (int i = 0; i < columnCount; i++) {
            md.append(" --- |");
        }
        md.append('\n');
    }

    /**
     * Returns the text a cell displays. For formula cells the cached result is used; the formula
     * is not re-evaluated.
     *
     * @param cell the cell to read; {@code null} yields an empty string
     */
    private static String getCellDisplayValue(Cell cell, DataFormatter formatter) {
        if (cell == null) {
            return "";
        }
        if (cell.getCellType() != CellType.FORMULA) {
            return formatter.formatCellValue(cell);
        }
        switch (cell.getCachedFormulaResultType()) {
            case NUMERIC: {
                // formatCellValue(cell) without an evaluator returns the formula text for formula
                // cells, so the cached number is formatted directly. formatRawCellContents applies
                // date formats as well as number formats.
                CellStyle style = cell.getCellStyle();
                String formatString = style.getDataFormatString();
                if (formatString == null) {
                    // NOTE(review): assumes "General" is an acceptable fallback when the style
                    // has no resolvable format string - confirm.
                    formatString = "General";
                }
                return formatter.formatRawCellContents(
                        cell.getNumericCellValue(), style.getDataFormat(), formatString);
            }
            case STRING:
                return cell.getStringCellValue();
            case BOOLEAN:
                return String.valueOf(cell.getBooleanCellValue());
            case ERROR:
                return FormulaError.forInt(cell.getErrorCellValue()).getString();
            default:
                return "";
        }
    }

    // Notes carried over from the original source (translated; sample data kept verbatim):
    //
    // After parsing, when the Markdown is ingested into the knowledge base, each line is prefixed
    // with a row number ("第N行", i.e. "row N") to help the large language model understand it.
    //
    // "This is an Excel sheet in Markdown format. Apart from the title and the header, help me
    // determine which row the actual content starts on and return that number. Take special care
    // not to miscount."
    // 第1行| 广西千汇食品有限公司营业月报表 | | | | | | | | | | | | | | | | | | |
    // 第2行| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
    // 第3行| | | | | | | | | | | | | | | | | | | |
    // 第4行| 日期 | 销售合计 | | | | | | | | | | | | | | | | | |
    // 第5行| | 代宰头数 | 毛边白条 | 边猪 | 合计头数 | 毛边重量 | 边猪重量 | 重量合计 | 毛边单价 元/公斤 | 边猪单价元/公斤 | 毛边金额 | 边猪金额 | 白条猪金额合计(元) | 猪杂 | 苦胆 | 代宰费 | 副产 | 猪血 | 应收金额 |
    // 第6行| 2023"年"4"月" | 3705 | 2405 | 13296 | 19406 | 243814.00 | 1452236.90 | 1696050.90 | 18.76 | 19.15 | 4573219.00 | 27807875.00 | 32,381,094.00 | 90,678.50 | 7,724.00 | 296,400.00 | 15,120.00 | 3,000.00 | 32,794,016.50 |
    // 第7行| 2023"年"5"月" | 3826 | 3760 | 14149 | 21735 | 372297.60 | 1524253.60 | 1896551.20 | 18.65 | 18.65 | 6943757.00 | 28427095.00 | 35,370,852.00 | 98,076.00 | 10,639.50 | 306,080.00 | 61,360.00 | 19,561.50 | 35,866,569.00 |
    // 第8行| 2023"年"6"月" | 1534 | 4593 | 13032 | 19159 | 464744.30 | 1405535.90 | 1870280.20 | 18.65 | 18.79 | 8667447.00 | 26413731.00 | 35,081,178.00 | 95,529.50 | 13,418.50 | 122,720.00 | 21,120.00 | 17,243.10 | 35,351,209.10 |
    // 第9行| 2023"年"7"月" | 950 | 4415 | 13196 | 18561 | 448880.45 | 1378896.20 | 1827776.65 | 18.90 | 19.02 | 8484251.00 | 26224348.00 | 34,708,599.00 | 95,029.00 | 12,431.50 | 76,000.00 | 48,540.00 | 16,704.90 | 34,957,304.40 |
    // 第10行| 2023"年"8"月" | 781 | 3591 | 11598 | 15970 | 362697.70 | 1215945.90 | 1578643.60 | 22.61 | 22.76 | 8200693.00 | 27673575.00 | 35,874,268.00 | 70,303.00 | 10,737.00 | 62,480.00 | - 0 | 14,373.00 | 36,032,161.00 |
    //
    // "Please extract the [column names / header] of the following Excel table:
    //
    // [table content]
    //
    // Requirements:
    // 1. Extract only the [column names] that identify the data columns (not the table's main title).
    // 2. If there are merged cells or multi-level headers, take care to tell them apart.
    // 3. If the first row is a merged main title, skip it and start from the real column-name row.
    // 4. Keep the original order and format of the column names."

    /**
     * Manual entry point: converts one workbook and writes one {@code .md} file per sheet.
     *
     * @param args optional; {@code args[0]} = path of the Excel file, {@code args[1]} = output
     *             directory. Missing arguments fall back to the built-in default paths.
     */
    public static void main(String[] args) throws Exception {
        String excelPath = args.length > 0
                ? args[0]
                : "D:\\Word\\桌面管理\\20251201\\SJ\\广西千汇食品有限公司-材料\\广西千汇食品有限公司-材料\\00 任期审计材料-韦锦流整理9.15\\13.财务数据\\2023.4-2025.6年营业报.xlsx";
        String outputDirPath = args.length > 1
                ? args[1]
                : "D:\\Word\\桌面管理\\20251201\\SJ\\markdown_output";

        File file = new File(excelPath);
        File outputDir = new File(outputDirPath);
        if (!outputDir.exists() && !outputDir.mkdirs()) {
            throw new RuntimeException("无法创建输出目录");
        }

        try (FileInputStream fis = new FileInputStream(file)) {
            Map<String, String> result = ExcelToMarkdownUtils.convertToMarkdown(fis, file.getName());
            for (Map.Entry<String, String> entry : result.entrySet()) {
                // Replace characters that are not allowed in file names.
                String fileName = entry.getKey().replaceAll("[\\\\/:*?\"<>|]", "_");
                File mdFile = new File(outputDir, fileName + ".md");
                // Write UTF-8 explicitly; FileWriter would use the platform default charset.
                try (Writer writer = new OutputStreamWriter(
                        new FileOutputStream(mdFile), java.nio.charset.StandardCharsets.UTF_8)) {
                    writer.write(entry.getValue());
                }
                System.out.println("已生成: " + mdFile.getAbsolutePath());
            }
        }
        System.out.println("✅ 全部转换完成");
    }
}