From fa1430030b5249a221197789a785db963142ec7d Mon Sep 17 00:00:00 2001
From: yuance <182865460@qq.com>
Date: Thu, 5 Mar 2026 10:41:11 +0800
Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9Eexcel=E8=BD=ACmarkdown?=
 =?UTF-8?q?=E5=B7=A5=E5=85=B7=E7=B1=BB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../ai/utils/ExcelToMarkdownUtils.java        | 243 ++++++++++++++++++
 1 file changed, 243 insertions(+)
 create mode 100644 src/main/java/com/gxwebsoft/ai/utils/ExcelToMarkdownUtils.java

diff --git a/src/main/java/com/gxwebsoft/ai/utils/ExcelToMarkdownUtils.java b/src/main/java/com/gxwebsoft/ai/utils/ExcelToMarkdownUtils.java
new file mode 100644
index 0000000..f453056
--- /dev/null
+++ b/src/main/java/com/gxwebsoft/ai/utils/ExcelToMarkdownUtils.java
@@ -0,0 +1,243 @@
+package com.gxwebsoft.ai.utils;
+
+import org.apache.poi.ss.usermodel.*;
+import org.springframework.web.multipart.MultipartFile;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Excel-to-Markdown conversion utilities (Apache POI 4+ / 5+).
+ *
+ * <p>Each sheet of a workbook is rendered as one Markdown table. Conversion happens in memory;
+ * only the {@link #main(String[])} demo writes files.
+ */
+public class ExcelToMarkdownUtils {
+
+    private ExcelToMarkdownUtils() {}
+
+    /**
+     * Converts every sheet of an uploaded Excel file to Markdown.
+     *
+     * @param file uploaded Excel file
+     * @return map in sheet order; key = "original file name without extension-sheet name",
+     *         value = Markdown of that sheet
+     * @throws IOException if the upload cannot be read as a workbook
+     */
+    public static Map<String, String> convertToMarkdown(MultipartFile file) throws IOException {
+        // The stream is opened here, so it is closed here as well.
+        try (InputStream inputStream = file.getInputStream()) {
+            return convertToMarkdown(inputStream, file.getOriginalFilename());
+        }
+    }
+
+    /**
+     * Converts every sheet of the workbook read from {@code inputStream} to Markdown.
+     *
+     * @param inputStream      Excel content; this method does not close the stream itself
+     * @param originalFilename name used as key prefix; the last extension is cut off and
+     *                         {@code null} becomes "unknown"
+     * @return map in sheet order; key = "file name-sheet name", value = Markdown of that sheet
+     * @throws IOException if the stream cannot be read as a workbook
+     */
+    public static Map<String, String> convertToMarkdown(InputStream inputStream, String originalFilename)
+            throws IOException {
+        String baseName = "unknown";
+        if (originalFilename != null) {
+            int dotIndex = originalFilename.lastIndexOf('.');
+            // A dot at index 0 (e.g. ".xlsx") is not treated as an extension separator.
+            baseName = dotIndex > 0 ? originalFilename.substring(0, dotIndex) : originalFilename;
+        }
+
+        Map<String, String> result = new LinkedHashMap<>();
+        try (Workbook workbook = WorkbookFactory.create(inputStream)) {
+            DataFormatter formatter = new DataFormatter();
+            for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
+                Sheet sheet = workbook.getSheetAt(i);
+                result.put(baseName + "-" + sheet.getSheetName(), sheetToMarkdown(sheet, formatter));
+            }
+        }
+        return result;
+    }
+
+    /**
+     * Renders one sheet as a Markdown table.
+     *
+     * <p>The cell texts are collected into a grid first and the table is built from that grid,
+     * so cell content is never re-parsed as table syntax.
+     */
+    private static String sheetToMarkdown(Sheet sheet, DataFormatter formatter) {
+        List<List<String>> grid = new ArrayList<>();
+        for (Row row : sheet) {
+            if (row == null || row.getPhysicalNumberOfCells() == 0) {
+                continue;
+            }
+            List<String> cells = new ArrayList<>();
+            int lastCellNum = row.getLastCellNum();
+            for (int i = 0; i < lastCellNum; i++) {
+                Cell cell = row.getCell(i, Row.MissingCellPolicy.CREATE_NULL_AS_BLANK);
+                cells.add(getCellDisplayValue(cell, formatter));
+            }
+            grid.add(cells);
+        }
+        return buildMarkdownTable(grid);
+    }
+
+    /**
+     * Returns the text displayed for a cell; a formula cell yields its cached result instead
+     * of the formula.
+     */
+    private static String getCellDisplayValue(Cell cell, DataFormatter formatter) {
+        if (cell == null) {
+            return "";
+        }
+        if (cell.getCellType() != CellType.FORMULA) {
+            return formatter.formatCellValue(cell);
+        }
+
+        switch (cell.getCachedFormulaResultType()) {
+            case NUMERIC:
+                // Format the cached number directly: formatCellValue(cell) would by default
+                // return the formula text for a formula cell. formatRawCellContents also
+                // applies date formats, so date-formatted formula cells are covered.
+                // NOTE(review): this overload assumes the 1900 date system - confirm whether
+                // 1904-based workbooks must be supported.
+                CellStyle style = cell.getCellStyle();
+                return formatter.formatRawCellContents(
+                        cell.getNumericCellValue(), style.getDataFormat(), style.getDataFormatString());
+            case STRING:
+                return cell.getStringCellValue();
+            case BOOLEAN:
+                return String.valueOf(cell.getBooleanCellValue());
+            case ERROR:
+                return FormulaError.forInt(cell.getErrorCellValue()).getString();
+            default:
+                return "";
+        }
+    }
+
+    /**
+     * Builds a Markdown table from a grid of cell texts.
+     *
+     * <p>Rows without any text are dropped, trailing columns that are empty in every remaining
+     * row are cut off, and shorter rows are padded so that all lines have the same number of
+     * columns. The separator line follows the first remaining row.
+     *
+     * @return the table, or an empty string if no cell contains text
+     */
+    private static String buildMarkdownTable(List<List<String>> grid) {
+        List<List<String>> rows = new ArrayList<>();
+        int columnCount = 0;
+        for (List<String> rawCells : grid) {
+            List<String> cells = new ArrayList<>(rawCells.size());
+            int lastNonEmpty = -1;
+            for (String rawCell : rawCells) {
+                String text = toMarkdownCell(rawCell);
+                if (!text.isEmpty()) {
+                    lastNonEmpty = cells.size();
+                }
+                cells.add(text);
+            }
+            if (lastNonEmpty >= 0) {
+                rows.add(cells);
+                columnCount = Math.max(columnCount, lastNonEmpty + 1);
+            }
+        }
+
+        StringBuilder md = new StringBuilder();
+        boolean headerWritten = false;
+        for (List<String> cells : rows) {
+            md.append('|');
+            for (int i = 0; i < columnCount; i++) {
+                md.append(' ').append(i < cells.size() ? cells.get(i) : "").append(" |");
+            }
+            md.append('\n');
+            if (!headerWritten) {
+                md.append('|');
+                for (int i = 0; i < columnCount; i++) {
+                    md.append(" --- |");
+                }
+                md.append('\n');
+                headerWritten = true;
+            }
+        }
+        return md.toString();
+    }
+
+    /** Makes a cell text safe for one table line: no line breaks, escaped pipes, trimmed. */
+    private static String toMarkdownCell(String value) {
+        if (value == null) {
+            return "";
+        }
+        return value.replace("\n", " ").replace("\r", "").replace("|", "\\|").trim();
+    }
+
+    // Usage notes (LLM prompt drafts):
+    // After parsing, when the Markdown is stored in the knowledge base, every line is prefixed
+    // with its line number ("第N行", i.e. "line N") to help the LLM understand the table.
+    //
+    // Prompt 1: This is an Excel sheet in Markdown format. Apart from the title/header rows,
+    // determine on which line the actual content starts and return that number; take special
+    // care not to miscount.
+    // 第1行:| 广西千汇食品有限公司营业月报表 | | | | | | | | | | | | | | | | | | |
+    // 第2行:| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+    // 第3行:| | | | | | | | | | | | | | | | | | | |
+    // 第4行:| 日期 | 销售合计 | | | | | | | | | | | | | | | | | |
+    // 第5行:| | 代宰头数 | 毛边白条 | 边猪 | 合计头数 | 毛边重量 | 边猪重量 | 重量合计 | 毛边单价 元/公斤 | 边猪单价元/公斤 | 毛边金额 | 边猪金额 | 白条猪金额合计(元) | 猪杂 | 苦胆 | 代宰费 | 副产 | 猪血 | 应收金额 |
+    // 第6行:| 2023"年"4"月" | 3705 | 2405 | 13296 | 19406 | 243814.00 | 1452236.90 | 1696050.90 | 18.76 | 19.15 | 4573219.00 | 27807875.00 | 32,381,094.00 | 90,678.50 | 7,724.00 | 296,400.00 | 15,120.00 | 3,000.00 | 32,794,016.50 |
+    // 第7行:| 2023"年"5"月" | 3826 | 3760 | 14149 | 21735 | 372297.60 | 1524253.60 | 1896551.20 | 18.65 | 18.65 | 6943757.00 | 28427095.00 | 35,370,852.00 | 98,076.00 | 10,639.50 | 306,080.00 | 61,360.00 | 19,561.50 | 35,866,569.00 |
+    // 第8行:| 2023"年"6"月" | 1534 | 4593 | 13032 | 19159 | 464744.30 | 1405535.90 | 1870280.20 | 18.65 | 18.79 | 8667447.00 | 26413731.00 | 35,081,178.00 | 95,529.50 | 13,418.50 | 122,720.00 | 21,120.00 | 17,243.10 | 35,351,209.10 |
+    // 第9行:| 2023"年"7"月" | 950 | 4415 | 13196 | 18561 | 448880.45 | 1378896.20 | 1827776.65 | 18.90 | 19.02 | 8484251.00 | 26224348.00 | 34,708,599.00 | 95,029.00 | 12,431.50 | 76,000.00 | 48,540.00 | 16,704.90 | 34,957,304.40 |
+    // 第10行:| 2023"年"8"月" | 781 | 3591 | 11598 | 15970 | 362697.70 | 1215945.90 | 1578643.60 | 22.61 | 22.76 | 8200693.00 | 27673575.00 | 35,874,268.00 | 70,303.00 | 10,737.00 | 62,480.00 | - 0 | 14,373.00 | 36,032,161.00 |
+    //
+    // Prompt 2: Extract the [column names / header] of the following Excel table:
+    //
+    // [table content]
+    //
+    // Requirements:
+    // 1. Extract only the [column names] that identify data columns (not the table's main title).
+    // 2. If there are merged cells or multi-level headers, distinguish them carefully.
+    // 3. If the first row is a merged main title, skip it and start from the real column-name row.
+    // 4. Keep the original order and format of the column names.
+
+    /**
+     * Demo entry point: converts one workbook and writes one UTF-8 {@code .md} file per sheet.
+     *
+     * @param args optional; {@code args[0]} = Excel file, {@code args[1]} = output directory.
+     *             Missing arguments fall back to the hard-coded local paths below.
+     */
+    public static void main(String[] args) throws Exception {
+        String excelPath = args != null && args.length > 0
+                ? args[0]
+                : "D:\\Word\\桌面管理\\20251201\\SJ\\广西千汇食品有限公司-材料\\广西千汇食品有限公司-材料\\00 任期审计材料-韦锦流(整理)9.15\\13.财务数据\\2023.4-2025.6年营业报.xlsx";
+        String outputDirPath = args != null && args.length > 1
+                ? args[1]
+                : "D:\\Word\\桌面管理\\20251201\\SJ\\markdown_output";
+
+        File file = new File(excelPath);
+        File outputDir = new File(outputDirPath);
+        if (!outputDir.exists() && !outputDir.mkdirs()) {
+            throw new RuntimeException("无法创建输出目录");
+        }
+
+        try (FileInputStream fis = new FileInputStream(file)) {
+            Map<String, String> result = ExcelToMarkdownUtils.convertToMarkdown(fis, file.getName());
+            for (Map.Entry<String, String> entry : result.entrySet()) {
+                // Replace characters that are not allowed in file names.
+                String fileName = entry.getKey().replaceAll("[\\\\/:*?\"<>|]", "_");
+                File mdFile = new File(outputDir, fileName + ".md");
+
+                // FileWriter(File) would use the default charset; write UTF-8 explicitly.
+                try (Writer writer = new OutputStreamWriter(
+                        new FileOutputStream(mdFile), StandardCharsets.UTF_8)) {
+                    writer.write(entry.getValue());
+                }
+                System.out.println("已生成: " + mdFile.getAbsolutePath());
+            }
+        }
+
+        System.out.println("✅ 全部转换完成");
+    }
+}
\ No newline at end of file