1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859 |
- package cn.jlsxwkj.common.reader;
- import cn.jlsxwkj.common.utils.SplitDocument;
- import lombok.Data;
- import org.springframework.ai.document.Document;
- import org.springframework.ai.document.DocumentReader;
- import org.springframework.core.io.DefaultResourceLoader;
- import org.springframework.core.io.Resource;
- import org.springframework.util.StreamUtils;
- import java.io.IOException;
- import java.nio.charset.Charset;
- import java.nio.charset.StandardCharsets;
- import java.util.List;
- import java.util.Objects;
- /**
- * @author zh
- * 文本读取
- */
- @Data
- public class ParagraphTextReader implements DocumentReader {
- private final Resource resource;
- private final Charset charset = StandardCharsets.UTF_8;
- /**
- * 窗口大小,为段落的数量,用于滚动读取
- */
- private final int windowSize;
- public ParagraphTextReader(String resourceUrl,
- int windowSize) {
- this(new DefaultResourceLoader().getResource(resourceUrl), windowSize);
- }
- public ParagraphTextReader(Resource resource, int windowSize) {
- Objects.requireNonNull(resource, "The Spring Resource must not be null");
- this.resource = resource;
- this.windowSize = windowSize;
- }
- /**
- * 读取文本内容,并根据换行进行分段,采用窗口模式,窗口为段落的数量
- *
- * @return 文档信息列表
- */
- @Override
- public List<Document> get() {
- try {
- var document = StreamUtils.copyToString(this.resource.getInputStream(), this.charset);
- return SplitDocument.splitStringToListDocument(document, this.windowSize, this.resource.getFilename());
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
- }
|