From fa1430030b5249a221197789a785db963142ec7d Mon Sep 17 00:00:00 2001
From: yuance <182865460@qq.com>
Date: Thu, 5 Mar 2026 10:41:11 +0800
Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9Eexcel=E8=BD=ACmarkdown?=
 =?UTF-8?q?=E5=B7=A5=E5=85=B7=E7=B1=BB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../ai/utils/ExcelToMarkdownUtils.java        | 258 ++++++++++++++++++
 1 file changed, 258 insertions(+)
 create mode 100644 src/main/java/com/gxwebsoft/ai/utils/ExcelToMarkdownUtils.java

diff --git a/src/main/java/com/gxwebsoft/ai/utils/ExcelToMarkdownUtils.java b/src/main/java/com/gxwebsoft/ai/utils/ExcelToMarkdownUtils.java
new file mode 100644
index 0000000..f453056
--- /dev/null
+++ b/src/main/java/com/gxwebsoft/ai/utils/ExcelToMarkdownUtils.java
@@ -0,0 +1,258 @@
+package com.gxwebsoft.ai.utils;
+
+import org.apache.poi.ss.usermodel.*;
+import org.springframework.web.multipart.MultipartFile;
+
+import java.io.*;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Converts Excel workbooks into Markdown tables (Apache POI 4+ / 5+).
+ *
+ * <p>Accepts a Spring {@link MultipartFile} or a plain {@link InputStream}; the conversion
+ * itself builds the Markdown in memory and writes nothing to disk.
+ *
+ * <p>Each worksheet becomes one pipe table whose first non-empty row serves as the header.
+ * Rows without any text and columns to the right of the last populated column are dropped.
+ *
+ * <p>All methods are static and the class holds no state.
+ */
+public class ExcelToMarkdownUtils {
+
+    /** Static helpers only; not meant to be instantiated. */
+    private ExcelToMarkdownUtils() {}
+
+    /**
+     * Converts every worksheet of an uploaded Excel file to Markdown.
+     *
+     * @param file uploaded Excel file
+     * @return map in sheet order: key = "original file name without extension-sheet name",
+     *         value = Markdown table of that sheet
+     * @throws IOException if the upload cannot be opened or the workbook cannot be read
+     */
+    public static Map<String, String> convertToMarkdown(MultipartFile file) throws IOException {
+        // The stream is opened here, so it is closed here as well.
+        try (InputStream in = file.getInputStream()) {
+            return convertToMarkdown(in, file.getOriginalFilename());
+        }
+    }
+
+    /**
+     * Converts every worksheet of the workbook read from {@code inputStream} to Markdown.
+     *
+     * @param inputStream      Excel content; this method does not close it explicitly
+     * @param originalFilename name used as key prefix; the last extension is stripped and
+     *                         {@code null} is replaced by "unknown"
+     * @return map in sheet order: key = "name-sheet name", value = Markdown table of that sheet
+     * @throws IOException if the workbook cannot be read
+     */
+    public static Map<String, String> convertToMarkdown(InputStream inputStream, String originalFilename) throws IOException {
+        String baseName = stripExtension(originalFilename);
+        Map<String, String> result = new LinkedHashMap<>();
+
+        try (Workbook workbook = WorkbookFactory.create(inputStream)) {
+            DataFormatter formatter = new DataFormatter();
+            int sheetCount = workbook.getNumberOfSheets();
+            for (int i = 0; i < sheetCount; i++) {
+                Sheet sheet = workbook.getSheetAt(i);
+                result.put(baseName + "-" + sheet.getSheetName(), sheetToMarkdown(sheet, formatter));
+            }
+        }
+        return result;
+    }
+
+    /** Returns {@code name} without its last extension, or "unknown" when {@code name} is null. */
+    private static String stripExtension(String name) {
+        if (name == null) {
+            return "unknown";
+        }
+        // "> 0" (not ">= 0") leaves names such as ".xlsx" untouched instead of emptying them.
+        int dotIndex = name.lastIndexOf('.');
+        return dotIndex > 0 ? name.substring(0, dotIndex) : name;
+    }
+
+    /**
+     * Converts one worksheet to a Markdown pipe table.
+     *
+     * <p>Cell values are collected first and rendered once, instead of generating text and
+     * re-parsing it on "|". That keeps the "---" separator row in the output, avoids a spurious
+     * empty first column, and lets cells contain "|" without shifting columns.
+     *
+     * @param sheet     worksheet to convert
+     * @param formatter formatter shared by all cells of the workbook
+     * @return the table, or an empty string when no cell of the sheet yields text
+     */
+    private static String sheetToMarkdown(Sheet sheet, DataFormatter formatter) {
+        List<List<String>> rows = new ArrayList<>();
+        int columnCount = 0;
+
+        for (Row row : sheet) {
+            if (row == null || row.getPhysicalNumberOfCells() == 0) {
+                continue;
+            }
+
+            int lastCellNum = row.getLastCellNum();
+            List<String> cells = new ArrayList<>();
+            int usedWidth = 0; // index of the last non-empty cell, plus one
+            for (int i = 0; i < lastCellNum; i++) {
+                // getCell yields null for gaps; getCellDisplayValue maps null to "".
+                String value = toTableCell(getCellDisplayValue(row.getCell(i), formatter));
+                cells.add(value);
+                if (!value.isEmpty()) {
+                    usedWidth = i + 1;
+                }
+            }
+
+            if (usedWidth == 0) {
+                continue; // nothing but blank cells: skip the row
+            }
+            rows.add(cells.subList(0, usedWidth));
+            columnCount = Math.max(columnCount, usedWidth);
+        }
+
+        return renderTable(rows, columnCount);
+    }
+
+    /**
+     * Prepares a display value for use inside one table cell: line breaks are flattened, "|" is
+     * escaped as "\|" so it is not taken for a column delimiter, and outer whitespace is trimmed.
+     */
+    private static String toTableCell(String value) {
+        return value.replace("\n", " ").replace("\r", "").replace("|", "\\|").trim();
+    }
+
+    /**
+     * Renders rows as a pipe table with {@code columnCount} columns; the first row is the header.
+     * Rows shorter than {@code columnCount} are padded with empty cells.
+     */
+    private static String renderTable(List<List<String>> rows, int columnCount) {
+        StringBuilder md = new StringBuilder();
+        for (int r = 0; r < rows.size(); r++) {
+            List<String> row = rows.get(r);
+            md.append('|');
+            for (int c = 0; c < columnCount; c++) {
+                md.append(' ').append(c < row.size() ? row.get(c) : "").append(" |");
+            }
+            md.append('\n');
+
+            if (r == 0) {
+                // Header separator, emitted exactly once.
+                md.append('|');
+                for (int c = 0; c < columnCount; c++) {
+                    md.append(" --- |");
+                }
+                md.append('\n');
+            }
+        }
+        return md.toString();
+    }
+
+    /**
+     * Returns the display text of a cell; formula cells use their cached result.
+     *
+     * @param cell      cell to read; {@code null} yields an empty string
+     * @param formatter formatter applied to non-formula cells and to numeric formula results
+     * @return display text of the cell
+     */
+    private static String getCellDisplayValue(Cell cell, DataFormatter formatter) {
+        if (cell == null) {
+            return "";
+        }
+        if (cell.getCellType() != CellType.FORMULA) {
+            return formatter.formatCellValue(cell);
+        }
+
+        switch (cell.getCachedFormulaResultType()) {
+            case NUMERIC: {
+                // Without an evaluator, formatCellValue(cell) returns the formula text by default,
+                // so the cached number is formatted directly; formatRawCellContents applies date
+                // formats as well as number formats.
+                CellStyle style = cell.getCellStyle();
+                return formatter.formatRawCellContents(
+                        cell.getNumericCellValue(), style.getDataFormat(), style.getDataFormatString());
+            }
+            case STRING:
+                return cell.getStringCellValue();
+            case BOOLEAN:
+                // Same spelling that DataFormatter produces for non-formula boolean cells.
+                return cell.getBooleanCellValue() ? "TRUE" : "FALSE";
+            case ERROR:
+                return FormulaError.forInt(cell.getErrorCellValue()).getString();
+            default:
+                return "";
+        }
+    }
+
+    // Notes on feeding the result to a knowledge base / LLM:
+    //
+    // * When the parsed table is stored in the knowledge base, every line is prefixed with its
+    //   row number ("第N行", i.e. "row N") to help the model understand the table.
+    //
+    // * Prompt used to find the first data row:
+    //     "This is an Excel sheet in Markdown format. Apart from the title and header rows, work
+    //      out on which row the actual content starts and return that number. Take special care
+    //      not to miscount."
+    //   Recorded sample (a monthly sales report, abridged; Chinese labels translated):
+    //     row 1:  | <report title> |  |  |  | ...
+    //     row 2:  | --- | --- | --- | --- | ...
+    //     row 3:  |  |  |  |  | ...
+    //     row 4:  | date | sales total |  |  | ...
+    //     row 5:  |  | <second header level: 18 column names> |
+    //     row 6:  | 2023"年"4"月" | 3705 | 2405 | 13296 | 19406 | 243814.00 | ...
+    //     row 7:  | 2023"年"5"月" | 3826 | 3760 | 14149 | 21735 | 372297.60 | ...
+    //     row 8:  | 2023"年"6"月" | 1534 | 4593 | 13032 | 19159 | 464744.30 | ...
+    //     row 9:  | 2023"年"7"月" | 950 | 4415 | 13196 | 18561 | 448880.45 | ...
+    //     row 10: | 2023"年"8"月" | 781 | 3591 | 11598 | 15970 | 362697.70 | ...
+    //
+    // * Prompt used to extract the column names:
+    //     "Extract the column names / header of the following Excel table:
+    //
+    //      [table content]
+    //
+    //      Requirements:
+    //      1. Extract only the names that label data columns (not the table's main title).
+    //      2. If there are merged cells or multi-level headers, take care to tell them apart.
+    //      3. If the first row is a merged main title, skip it and start from the real
+    //         column-name row.
+    //      4. Keep the original order and format of the column names."
+
+    /**
+     * Manual smoke test: converts one workbook and writes one UTF-8 ".md" file per sheet.
+     *
+     * @param args optional; {@code args[0]} = Excel file, {@code args[1]} = output directory.
+     *             The hard-coded local paths below are used for whatever is omitted.
+     */
+    public static void main(String[] args) throws Exception {
+        String excelPath = args.length > 0 ? args[0]
+                : "D:\\Word\\桌面管理\\20251201\\SJ\\广西千汇食品有限公司-材料\\广西千汇食品有限公司-材料\\00 任期审计材料-韦锦流（整理）9.15\\13.财务数据\\2023.4-2025.6年营业报.xlsx";
+        String outputDirPath = args.length > 1 ? args[1]
+                : "D:\\Word\\桌面管理\\20251201\\SJ\\markdown_output";
+
+        File file = new File(excelPath);
+        File outputDir = new File(outputDirPath);
+        if (!outputDir.exists() && !outputDir.mkdirs()) {
+            throw new RuntimeException("无法创建输出目录");
+        }
+
+        try (FileInputStream fis = new FileInputStream(file)) {
+            Map<String, String> result = convertToMarkdown(fis, file.getName());
+
+            for (Map.Entry<String, String> entry : result.entrySet()) {
+                // Replace characters that are not allowed in file names.
+                String fileName = entry.getKey().replaceAll("[\\\\/:*?\"<>|]", "_");
+                File mdFile = new File(outputDir, fileName + ".md");
+
+                // FileWriter would fall back to the platform charset; write UTF-8 explicitly.
+                try (Writer writer = new OutputStreamWriter(
+                        new FileOutputStream(mdFile), java.nio.charset.StandardCharsets.UTF_8)) {
+                    writer.write(entry.getValue());
+                }
+
+                System.out.println("已生成: " + mdFile.getAbsolutePath());
+            }
+        }
+
+        System.out.println("✅ 全部转换完成");
+    }
+}
\ No newline at end of file