ParagraphTextReader.java 1.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. package cn.jlsxwkj.common.reader;
  2. import cn.jlsxwkj.common.utils.SplitDocument;
  3. import lombok.Data;
  4. import org.springframework.ai.document.Document;
  5. import org.springframework.ai.document.DocumentReader;
  6. import org.springframework.core.io.DefaultResourceLoader;
  7. import org.springframework.core.io.Resource;
  8. import org.springframework.util.StreamUtils;
  9. import java.io.IOException;
  10. import java.nio.charset.Charset;
  11. import java.nio.charset.StandardCharsets;
  12. import java.util.List;
  13. import java.util.Objects;
  14. /**
  15. * @author zh
  16. * 文本读取
  17. */
  18. @Data
  19. public class ParagraphTextReader implements DocumentReader {
  20. private final Resource resource;
  21. private final Charset charset = StandardCharsets.UTF_8;
  22. /**
  23. * 窗口大小,为段落的数量,用于滚动读取
  24. */
  25. private final int windowSize;
  26. public ParagraphTextReader(String resourceUrl,
  27. int windowSize) {
  28. this(new DefaultResourceLoader().getResource(resourceUrl), windowSize);
  29. }
  30. public ParagraphTextReader(Resource resource, int windowSize) {
  31. Objects.requireNonNull(resource, "The Spring Resource must not be null");
  32. this.resource = resource;
  33. this.windowSize = windowSize;
  34. }
  35. /**
  36. * 读取文本内容,并根据换行进行分段,采用窗口模式,窗口为段落的数量
  37. *
  38. * @return 文档信息列表
  39. */
  40. @Override
  41. public List<Document> get() {
  42. try {
  43. var document = StreamUtils.copyToString(this.resource.getInputStream(), this.charset);
  44. return SplitDocument.splitStringToListDocument(document, this.windowSize, this.resource.getFilename());
  45. } catch (IOException e) {
  46. throw new RuntimeException(e);
  47. }
  48. }
  49. }