From d0c8eea5cd38302e32c0bca4efa1136bf154ddb3 Mon Sep 17 00:00:00 2001
From: zws <2985693012@qq.com>
Date: Wed, 29 Oct 2025 14:28:22 +0800
Subject: [PATCH 1/2] Add a tool module for writing Word documents (.docx).
 The tool writes generated report content to a local or specified path in
 .docx format, so users can directly obtain an editable report file. Adds the
 word_writer tool file for exporting text content to a Word document.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PS: This is my first PR. If anything here is inappropriate or does not follow
the project conventions, please point it out! I will gladly revise and keep
improving. Thank you very much for your time and help!
---
 .../tool/common_tool/write_word_tool.py       | 62 +++++++++++++
 .../agent/action/tool/test_write_word_tool.py | 89 +++++++++++++++++++
 2 files changed, 151 insertions(+)
 create mode 100644 agentuniverse/agent/action/tool/common_tool/write_word_tool.py
 create mode 100644 tests/test_agentuniverse/unit/agent/action/tool/test_write_word_tool.py

diff --git a/agentuniverse/agent/action/tool/common_tool/write_word_tool.py b/agentuniverse/agent/action/tool/common_tool/write_word_tool.py
new file mode 100644
index 00000000..4b7cade2
--- /dev/null
+++ b/agentuniverse/agent/action/tool/common_tool/write_word_tool.py
@@ -0,0 +1,62 @@
+import os
+import json
+from typing import Any, Dict
+
+from agentuniverse.agent.action.tool.tool import Tool
+
+
+class WriteWordDocumentTool(Tool):
+    def execute(self, file_path: str, content: str = "", append: bool = False) -> str:
+        directory = os.path.dirname(file_path)
+        if directory and not os.path.exists(directory):
+            try:
+                os.makedirs(directory, exist_ok=True)
+            except Exception as e:
+                return json.dumps(
+                    {"error": f"Failed to create directory: {str(e)}", "file_path": file_path, "status": "error"}
+                )
+
+        try:
+            from docx import Document  # type: ignore
+        except ImportError as e:
+            return json.dumps(
+                {
+                    "error": f"python-docx is required to write Word documents: {str(e)}",
+                    "file_path": file_path,
+                    "status": "error",
+                }
+            )
+
+        if not file_path.lower().endswith(".docx"):
+            return json.dumps(
+                {"error": "The target file must have a .docx extension.", "file_path": file_path, "status": "error"}
+            )
+
+        document = None
+        if append and os.path.exists(file_path):
+            try:
+                document = Document(file_path)
+            except Exception as e:
+                return json.dumps(
+                    {"error": f"Failed to load existing document: {str(e)}", "file_path": file_path, "status": "error"}
+                )
+        else:
+            document = Document()
+
+        try:
+            document.add_paragraph(content)
+            document.save(file_path)
+            file_size = os.path.getsize(file_path)
+            return json.dumps(
+                {
+                    "file_path": file_path,
+                    "bytes_written": len(content.encode("utf-8")),
+                    "file_size": file_size,
+                    "append_mode": append,
+                    "status": "success",
+                }
+            )
+        except Exception as e:
+            return json.dumps(
+                {"error": f"Failed to write document: {str(e)}", "file_path": file_path, "status": "error"}
+            )
diff --git a/tests/test_agentuniverse/unit/agent/action/tool/test_write_word_tool.py b/tests/test_agentuniverse/unit/agent/action/tool/test_write_word_tool.py
new file mode 100644
index 00000000..c987aadd
--- /dev/null
+++ b/tests/test_agentuniverse/unit/agent/action/tool/test_write_word_tool.py
@@ -0,0 +1,89 @@
+import os
+import json
+import tempfile
+import unittest
+from agentuniverse.agent.action.tool.common_tool.write_word_tool import WriteWordDocumentTool
+
+
+class WriteWordDocumentToolTest(unittest.TestCase):
+    def setUp(self):
+        self.tool = WriteWordDocumentTool()
+        self.temp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        for root, dirs, files in os.walk(self.temp_dir, topdown=False):
+            for name in files:
+                os.unlink(os.path.join(root, name))
+            for name in dirs:
+                os.rmdir(os.path.join(root, name))
+        os.rmdir(self.temp_dir)
+
+    def test_write_new_word_file(self):
+        file_path = os.path.join(self.temp_dir, "test_new.docx")
+        content = "***This is a test paragraph.***"
+
+        result_json = self.tool.execute(file_path=file_path, content=content, append=False)
+        result = json.loads(result_json)
+
+        self.assertEqual(result["status"], "success")
+        self.assertEqual(result["file_path"], file_path)
+        self.assertTrue(os.path.exists(file_path))
+
+    def test_append_to_word_file(self):
+        file_path = os.path.join(self.temp_dir, "test_append.docx")
+
+        initial_content = "Initial paragraph."
+        self.tool.execute(file_path=file_path, content=initial_content, append=False)
+
+        append_content = "Appended paragraph."
+        result_json = self.tool.execute(file_path=file_path, content=append_content, append=True)
+        result = json.loads(result_json)
+
+        self.assertEqual(result["status"], "success")
+        self.assertEqual(result["append_mode"], True)
+
+    def test_invalid_file_extension(self):
+        file_path = os.path.join(self.temp_dir, "invalid_file.txt")
+        content = "This should fail."
+
+        result_json = self.tool.execute(file_path=file_path, content=content, append=False)
+        result = json.loads(result_json)
+
+        self.assertEqual(result["status"], "error")
+        self.assertIn("The target file must have a .docx extension.", result["error"])
+
+    def test_create_directory_structure(self):
+        file_path = os.path.join(self.temp_dir, "nested/dir/structure/test.docx")
+        content = "Test content in nested directory."
+
+        result_json = self.tool.execute(file_path=file_path, content=content, append=False)
+        result = json.loads(result_json)
+
+        self.assertEqual(result["status"], "success")
+        self.assertTrue(os.path.exists(file_path))
+        self.assertTrue(os.path.isdir(os.path.join(self.temp_dir, "nested/dir/structure")))
+
+    def test_missing_dependency(self):
+        import builtins
+        original_import = builtins.__import__
+        def mock_import(name, *args):
+            if name == "docx":
+                raise ImportError("No module named 'docx'")
+            return original_import(name, *args)
+
+        try:
+            builtins.__import__ = mock_import
+            file_path = os.path.join(self.temp_dir, "test_missing_dependency.docx")
+            content = "This should fail due to missing dependency."
+
+            result_json = self.tool.execute(file_path=file_path, content=content, append=False)
+            result = json.loads(result_json)
+
+            self.assertEqual(result["status"], "error")
+            self.assertIn("python-docx is required to write Word documents", result["error"])
+        finally:
+            builtins.__import__ = original_import
+
+
+if __name__ == "__main__":
+    unittest.main()
\ No newline at end of file

From 40b863536621a5b0775e45c4b754dd2499f5ba3c Mon Sep 17 00:00:00 2001
From: Sunflower <129384209+SunFlowerUE@users.noreply.github.com>
Date: Thu, 30 Oct 2025 11:43:25 +0800
Subject: [PATCH 2/2] Enhance reader text handling with encoding detection

---
 .../knowledge/reader/file/csv_reader.py       | 77 +++++++++++--------
 .../knowledge/reader/file/file_reader.py      |  7 ++
 .../knowledge/reader/file/txt_reader.py       | 16 ++--
 .../action/knowledge/reader/reader_manager.py |  3 +
 .../agent/action/knowledge/reader/utils.py    | 69 +++++++++++++++++
 .../reader/file/test_text_encoding.py         | 38 +++++++++
 6 files changed, 168 insertions(+), 42 deletions(-)
 create mode 100644 agentuniverse/agent/action/knowledge/reader/utils.py
 create mode 100644 tests/test_agentuniverse/unit/agent/action/knowledge/reader/file/test_text_encoding.py

diff --git a/agentuniverse/agent/action/knowledge/reader/file/csv_reader.py b/agentuniverse/agent/action/knowledge/reader/file/csv_reader.py
index 4888886a..d37d24c4 100644
--- a/agentuniverse/agent/action/knowledge/reader/file/csv_reader.py
+++ b/agentuniverse/agent/action/knowledge/reader/file/csv_reader.py
@@ -6,71 +6,80 @@
 # @FileName: csv_reader.py
 
 import csv
+import io
 from pathlib import Path
-from typing import List, Union, Optional, Dict
+from typing import List, Union, Optional, Dict, TextIO
 
 from agentuniverse.agent.action.knowledge.reader.reader import Reader
 from agentuniverse.agent.action.knowledge.store.document import Document
+from agentuniverse.agent.action.knowledge.reader.utils import detect_file_encoding
 
 
 class CSVReader(Reader):
     """CSV file reader.
-    
+
     Used to read and parse CSV format files, supports local file paths or file objects as input.
     """
 
-    def _load_data(self,
-                   file: Union[str, Path],
+    def _load_data(self,
+                   file: Union[str, Path, TextIO],
                    delimiter: str = ",",
                    quotechar: str = '"',
                    ext_info: Optional[Dict] = None) -> List[Document]:
-        """Parse CSV file.
-
-        Args:
-            file: CSV file path or file object
-            delimiter: CSV delimiter, default is comma
-            quotechar: Quote character, default is double quote
-            ext_info: Additional metadata information
-
-        Returns:
-            List[Document]: List of documents containing CSV content
-
-        Raises:
-            FileNotFoundError: Raised when file does not exist
-            ValueError: Raised when file reading fails
-        """
+        """Parse CSV file."""

         try:
+            text_stream: TextIO
+            should_close = False
+
             if isinstance(file, str):
                 file = Path(file)
-
+
             if isinstance(file, Path):
                 if not file.exists():
                     raise FileNotFoundError(f"File not found: {file}")
-                file_content = file.open(newline="", mode="r", encoding="utf-8")
+                encoding = detect_file_encoding(file)
+                text_stream = file.open(newline="", mode="r", encoding=encoding)
+                should_close = True
+            elif hasattr(file, "read"):
+                try:
+                    file.seek(0)
+                except (AttributeError, OSError):
+                    pass
+                raw_content = file.read()
+                if isinstance(raw_content, bytes):
+                    encoding = detect_file_encoding(raw_content)
+                    text_stream = io.StringIO(raw_content.decode(encoding))
+                elif isinstance(raw_content, str):
+                    text_stream = io.StringIO(raw_content)
+                else:
+                    raise ValueError("Unsupported file object type")
+                should_close = True
             else:
-                file.seek(0)
-                file_content = file
+                raise TypeError("file must be a path string, Path, or file-like object")

-            csv_content = []
-            with file_content as csvfile:
-                csv_reader = csv.reader(csvfile, delimiter=delimiter, quotechar=quotechar)
+            csv_content: List[str] = []
+            try:
+                csv_reader = csv.reader(text_stream, delimiter=delimiter, quotechar=quotechar)
                 for row in csv_reader:
-                    # Filter out completely empty rows
                     if any(cell.strip() for cell in row):
-                        # Remove empty values at the end of row
                         while row and not row[-1].strip():
                             row.pop()
-                        # Only add non-empty values to result
                         csv_content.append(", ".join(filter(None, row)))
-
-            # Combine all valid rows into final text
+            finally:
+                if should_close:
+                    text_stream.close()
+
             final_content = "\n".join(csv_content)
-            # Get metadata
-            metadata = {"file_name": getattr(file, 'name', 'unknown')}
+            if isinstance(file, Path):
+                file_name = file.name
+            else:
+                name_attr = getattr(file, 'name', None)
+                file_name = Path(name_attr).name if isinstance(name_attr, str) else 'unknown'
+            metadata = {"file_name": file_name}
             if ext_info:
                 metadata.update(ext_info)
-            # print(f"csv_content: {final_content} \n metadata: {metadata}")
+
             return [Document(text=final_content, metadata=metadata)]
         except Exception as e:
             raise ValueError(f"Failed to read CSV file: {str(e)}") from e

diff --git a/agentuniverse/agent/action/knowledge/reader/file/file_reader.py b/agentuniverse/agent/action/knowledge/reader/file/file_reader.py
index 581d323e..5f7acd07 100644
--- a/agentuniverse/agent/action/knowledge/reader/file/file_reader.py
+++ b/agentuniverse/agent/action/knowledge/reader/file/file_reader.py
@@ -10,8 +10,11 @@ from typing import Dict, Type, List, Optional
 
 from agentuniverse.agent.action.knowledge.reader.file.docx_reader import DocxReader
 from agentuniverse.agent.action.knowledge.reader.file.epub_reader import EpubReader
+from agentuniverse.agent.action.knowledge.reader.file.markdown_reader import MarkdownReader
 from agentuniverse.agent.action.knowledge.reader.file.pdf_reader import PdfReader
 from agentuniverse.agent.action.knowledge.reader.file.pptx_reader import PptxReader
+from agentuniverse.agent.action.knowledge.reader.file.txt_reader import TxtReader
+from agentuniverse.agent.action.knowledge.reader.file.csv_reader import CSVReader
 from agentuniverse.agent.action.knowledge.reader.file.xlsx_reader import XlsxReader
 from agentuniverse.agent.action.knowledge.reader.reader import Reader
 from agentuniverse.agent.action.knowledge.store.document import Document
@@ -22,6 +25,10 @@ DEFAULT_FILE_READERS: Dict[str, Type[Reader]] = {
     ".pptx": PptxReader,
     ".xlsx": XlsxReader,
     ".epub": EpubReader,
+    ".txt": TxtReader,
+    ".md": MarkdownReader,
+    ".markdown": MarkdownReader,
+    ".csv": CSVReader,
 }
 
 
diff --git a/agentuniverse/agent/action/knowledge/reader/file/txt_reader.py b/agentuniverse/agent/action/knowledge/reader/file/txt_reader.py
index 6373902a..d837277c 100644
--- a/agentuniverse/agent/action/knowledge/reader/file/txt_reader.py
+++ b/agentuniverse/agent/action/knowledge/reader/file/txt_reader.py
@@ -1,19 +1,19 @@
-
 from pathlib import Path
 from typing import List, Optional, Dict
 
 from agentuniverse.agent.action.knowledge.reader.reader import Reader
 from agentuniverse.agent.action.knowledge.store.document import Document
+from agentuniverse.agent.action.knowledge.reader.utils import detect_file_encoding
 
 
 class LineTxtReader(Reader):
     def _load_data(self, fpath: Path, ext_info: Optional[Dict] = None) -> List[Document]:
-        dlist = []
+        dlist: List[Document] = []
+        encoding = detect_file_encoding(fpath)
 
-        with open(fpath, 'r', encoding='utf-8') as file:
-
-            metadata = {"file_name": file.name}
+        with open(fpath, 'r', encoding=encoding) as file:
+            metadata = {"file_name": Path(file.name).name}
             if ext_info is not None:
                 metadata.update(ext_info)
 
@@ -27,10 +27,10 @@ class TxtReader(Reader):
     """Txt reader."""
 
     def _load_data(self, fpath: Path, ext_info: Optional[Dict] = None) -> List[Document]:
+        encoding = detect_file_encoding(fpath)
 
-        with open(fpath, 'r', encoding='utf-8') as file:
-
-            metadata = {"file_name": file.name}
+        with open(fpath, 'r', encoding=encoding) as file:
+            metadata = {"file_name": Path(file.name).name}
             if ext_info is not None:
                 metadata.update(ext_info)
 
diff --git a/agentuniverse/agent/action/knowledge/reader/reader_manager.py b/agentuniverse/agent/action/knowledge/reader/reader_manager.py
index 21a95ccf..4eaee2a4 100644
--- a/agentuniverse/agent/action/knowledge/reader/reader_manager.py
+++ b/agentuniverse/agent/action/knowledge/reader/reader_manager.py
@@ -21,6 +21,9 @@ class ReaderManager(ComponentManagerBase[Reader]):
         "pptx": "default_pptx_reader",
         "docx": "default_docx_reader",
         "txt": "default_txt_reader",
+        "md": "default_markdown_reader",
+        "markdown": "default_markdown_reader",
+        "csv": "default_csv_reader",
         # extended defaults for web & images
         "url": "default_web_page_reader",
         "png": "default_image_ocr_reader",
diff --git a/agentuniverse/agent/action/knowledge/reader/utils.py b/agentuniverse/agent/action/knowledge/reader/utils.py
new file mode 100644
index 00000000..48ada34f
--- /dev/null
+++ b/agentuniverse/agent/action/knowledge/reader/utils.py
@@ -0,0 +1,69 @@
+"""Utility helpers for reader implementations."""
+from __future__ import annotations
+
+from pathlib import Path
+from typing import BinaryIO, Iterable, Sequence, Union
+
+# Candidate encodings to try when automatic detection libraries are not available.
+_FALLBACK_ENCODINGS: Sequence[str] = (
+    "utf-8",
+    "utf-8-sig",
+    "gb18030",
+    "gbk",
+    "big5",
+    "shift_jis",
+    "latin-1",
+)
+
+
+def _read_sample_bytes(source: Union[str, Path, BinaryIO, bytes, bytearray],
+                       sample_size: int) -> bytes:
+    """Read a byte sample from the given file path or binary handle."""
+    if isinstance(source, (bytes, bytearray)):
+        return bytes(source[:sample_size])
+    if isinstance(source, (str, Path)):
+        path = Path(source)
+        with path.open("rb") as handle:
+            return handle.read(sample_size)
+
+    # File-like object – preserve the original pointer
+    handle = source
+    current_pos = handle.tell()
+    try:
+        data = handle.read(sample_size)
+    finally:
+        handle.seek(current_pos)
+    return data if data is not None else b""
+
+
+def detect_file_encoding(source: Union[str, Path, BinaryIO, bytes, bytearray],
+                         sample_size: int = 32 * 1024,
+                         fallback_encodings: Iterable[str] = _FALLBACK_ENCODINGS) -> str:
+    """Best-effort detection of the text encoding for the given file."""
+    sample = _read_sample_bytes(source, sample_size)
+    if not sample:
+        return "utf-8"
+
+    # First try decoding with a curated list of encodings
+    for encoding in fallback_encodings:
+        try:
+            sample.decode(encoding)
+            return encoding
+        except UnicodeDecodeError:
+            continue
+
+    # If the curated list fails, fall back to charset_normalizer if available
+    try:  # pragma: no cover - optional dependency
+        from charset_normalizer import from_bytes
+    except ImportError:  # pragma: no cover - handled above
+        best_guess = None
+    else:
+        result = from_bytes(sample).best()
+        best_guess = result.encoding if result is not None else None
+    if best_guess:
+        return best_guess
+
+    return "utf-8"
+
+
+__all__ = ["detect_file_encoding"]
diff --git a/tests/test_agentuniverse/unit/agent/action/knowledge/reader/file/test_text_encoding.py b/tests/test_agentuniverse/unit/agent/action/knowledge/reader/file/test_text_encoding.py
new file mode 100644
index 00000000..2a09e3d6
--- /dev/null
+++ b/tests/test_agentuniverse/unit/agent/action/knowledge/reader/file/test_text_encoding.py
@@ -0,0 +1,38 @@
+from agentuniverse.agent.action.knowledge.reader.file.csv_reader import CSVReader
+from agentuniverse.agent.action.knowledge.reader.file.txt_reader import TxtReader
+from agentuniverse.agent.action.knowledge.reader.utils import detect_file_encoding
+
+
+def test_detect_file_encoding_gb18030(tmp_path):
+    sample_text = "示例文本"
+    file_path = tmp_path / "sample.txt"
+    file_path.write_text(sample_text, encoding="gb18030")
+
+    detected = detect_file_encoding(file_path)
+    assert detected in {"gb18030", "gbk"}
+
+
+def test_txt_reader_handles_gbk(tmp_path):
+    content = "第一行\n第二行"
+    file_path = tmp_path / "gbk.txt"
+    file_path.write_text(content, encoding="gb18030")
+
+    reader = TxtReader()
+    documents = reader.load_data(file_path)
+
+    assert len(documents) == 1
+    assert documents[0].text == content
+    assert documents[0].metadata["file_name"] == file_path.name
+
+
+def test_csv_reader_handles_utf8_bom(tmp_path):
+    rows = ["col1,col2", "值1,值2"]
+    file_path = tmp_path / "data.csv"
+    file_path.write_text("\n".join(rows), encoding="utf-8-sig")
+
+    reader = CSVReader()
+    documents = reader.load_data(file_path)
+
+    assert len(documents) == 1
+    assert "值1" in documents[0].text
+    assert documents[0].metadata["file_name"] == file_path.name
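
A minimal usage sketch of the two additions in this series, not part of either patch: it assumes python-docx is installed, that the tool and readers can be instantiated directly (as the unit tests above do), and that the file paths shown are placeholders.

    import json
    from pathlib import Path

    from agentuniverse.agent.action.tool.common_tool.write_word_tool import WriteWordDocumentTool
    from agentuniverse.agent.action.knowledge.reader.file.txt_reader import TxtReader
    from agentuniverse.agent.action.knowledge.reader.utils import detect_file_encoding

    # Write one paragraph into a new .docx file; the tool returns a JSON status string.
    result = json.loads(
        WriteWordDocumentTool().execute(file_path="./reports/summary.docx",
                                        content="Quarterly summary.",
                                        append=False)
    )
    print(result["status"], result.get("file_size"))

    # Read a GBK/GB18030-encoded text file without hard-coding the encoding.
    sample = Path("./data/sample_gbk.txt")
    print(detect_file_encoding(sample))  # e.g. "gb18030"
    for doc in TxtReader().load_data(sample):
        print(doc.metadata["file_name"], doc.text[:50])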