Test.java 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. package org.example.lc;
  2. import org.apache.pdfbox.Loader;
  3. import org.apache.pdfbox.pdmodel.PDDocument;
  4. import technology.tabula.*;
  5. import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
  6. import java.io.File;
  7. import java.util.ArrayList;
  8. import java.util.List;
  9. import java.util.stream.Collectors;
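/**
 * Reads the tables of a PDF with Tabula, keeps the rows found between a start marker and an end
 * marker, and prints them to standard output as a pipe-delimited (Markdown-style) table.
 */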
  13. public class Test {
  14. private List<String> readPdfTables(String filename, String startMarker, String endMarker) throws Exception {
  15. PDDocument document = Loader.loadPDF(new File(filename));
  16. List<String> tableRows = new ArrayList<>();
  17. SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
  18. PageIterator pi = new ObjectExtractor(document).extract();
  19. boolean endFlag = false;
  20. while (pi.hasNext()) {
  21. // iterate over the pages of the document
  22. Page page = pi.next();
  23. List<Table> table = sea.extract(page);
  24. // iterate over the tables of the page
  25. for (Table tables : table) {
  26. if (endFlag) {
  27. break;
  28. }
  29. boolean containsStartMarker = false;
  30. boolean containsEndMarker = false;
  31. List<List<RectangularTextContainer>> rows = tables.getRows();
  32. // iterate over the rows of the table
  33. for (List<RectangularTextContainer> cells : rows) {
  34. String rowStr = cells.stream()
  35. .map(item -> item.getText().replace("\r", " ") + "|")
  36. .collect(Collectors.joining());
  37. if (rowStr.contains(startMarker)) {
  38. containsStartMarker = true;
  39. containsEndMarker = false;
  40. }
  41. if (rowStr.contains(endMarker)) {
  42. containsStartMarker = false;
  43. containsEndMarker = true;
  44. }
  45. if (containsStartMarker) {
  46. tableRows.add(rowStr);
  47. }
  48. if (containsEndMarker) {
  49. tableRows.add(rowStr);
  50. endFlag = true;
  51. break;
  52. }
  53. }
  54. }
  55. }
  56. return tableRows;
  57. }
  58. public static String generatePipeString(int numberOfPipes) {
  59. if (numberOfPipes <= 0) {
  60. return "";
  61. }
  62. StringBuilder sb = new StringBuilder();
  63. for (int i = 0; i < numberOfPipes; i++) {
  64. sb.append("-"); // 添加横线
  65. if (i < numberOfPipes - 1) {
  66. sb.append("|"); // 添加竖线,除了最后一个
  67. }
  68. }
  69. return sb.toString();
  70. }
  71. public static int countOccurrences(String str, String ch) {
  72. if (str == null || str.isEmpty()) {
  73. return 0;
  74. }
  75. // 使用 split 方法分割字符串,并计算分割出的部分数量
  76. String[] parts = str.split(ch, -1);
  77. return parts.length - 1; // 出现次数等于分割部分数量减去 1
  78. }
  79. /**
  80. * @param args
  81. */
  82. public static void main(String[] args) {
  83. Test t = new Test();
  84. String filePath = "C:\\Users\\Acer\\Desktop\\新员工资料\\3、项目\\1、邮储POC\\迭代资料\\工元致远2021年第一期汽车分期绿色资产证券化信托受托机构报告2022年第7期(总第10期).pdf";
  85. String startMarker = "利息兑付情况";
  86. String endMarker = "31,087,874.27|31,087,874.27";
  87. try {
  88. List<String> rows = t.readPdfTables(filePath, startMarker, endMarker);
  89. if (rows.size() > 0) {
  90. //根据第一行内容,添加markdown表格分隔符,放置到第二行
  91. String pipeCharacter = "\\|";
  92. // 统计字符串中 | 的数量
  93. int count = countOccurrences(rows.get(0), pipeCharacter);
  94. rows.add(1, generatePipeString(count));
  95. }
  96. for (String row : rows) {
  97. System.out.println(row);
  98. }
  99. } catch (Exception e) {
  100. // TODO Auto-generated catch block
  101. e.printStackTrace();
  102. }
  103. }
  104. }