diff --git a/README.md b/README.md
index 06f052b..159877a 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,7 @@ pip install semantra
 ## Usage
 
 Semantra operates on collections of documents — text or PDF files — stored on your local computer.
+Word `.docx` and PowerPoint `.pptx` files can also be indexed as extracted text.
 
 At its simplest, you can run Semantra over a single document by running:
 
diff --git a/src/semantra/office.py b/src/semantra/office.py
new file mode 100644
index 0000000..10736f3
--- /dev/null
+++ b/src/semantra/office.py
@@ -0,0 +1,160 @@
+"""Extract plain text from Word (.docx) and PowerPoint (.pptx) files."""
+
+import posixpath
+import re
+import zipfile
+import xml.etree.ElementTree as ET
+
+# NOTE: these files are parsed with the standard-library XML parser, which is
+# not hardened against maliciously constructed XML; index trusted files only.
+
+SLIDE_NAME_RE = re.compile(r"ppt/slides/slide[0-9]+\.xml$")
+PRESENTATION_PART = "ppt/presentation.xml"
+PRESENTATION_RELS_PART = "ppt/_rels/presentation.xml.rels"
+
+# Inline elements that stand for whitespace inside a paragraph.
+INLINE_WHITESPACE = {"br": "\n", "cr": "\n", "tab": "\t"}
+# Subtrees a paragraph's own text never comes from: nested paragraphs are
+# emitted on their own, and paragraph properties hold tab-stop definitions
+# that share the local name "tab" with the tab character.
+SKIPPED_IN_PARAGRAPH = frozenset({"p", "pPr"})
+
+
+class OfficeContent:
+    """Extracted text of one Office document, exposed as plain text."""
+
+    def __init__(self, rawtext, filename):
+        self.rawtext = rawtext
+        self.filename = filename
+        self.filetype = "text"
+
+
+def get_office_content(filename):
+    """Return an OfficeContent with the text of a .docx or .pptx file.
+
+    The extension is matched case-insensitively; any other extension raises
+    ValueError.
+    """
+    lowered = filename.lower()
+    if lowered.endswith(".docx"):
+        rawtext = get_docx_text(filename)
+    elif lowered.endswith(".pptx"):
+        rawtext = get_pptx_text(filename)
+    else:
+        raise ValueError(f"Unsupported Office file: {filename}")
+    return OfficeContent(rawtext, filename)
+
+
+def get_docx_text(filename):
+    """Return the text of the main document part of a .docx file."""
+    with zipfile.ZipFile(filename) as docx:
+        return get_xml_text(docx.read("word/document.xml"))
+
+
+def get_pptx_text(filename):
+    """Return the text of every slide of a .pptx file, in slide order."""
+    with zipfile.ZipFile(filename) as pptx:
+        return "\n\n".join(
+            get_xml_text(pptx.read(name)) for name in get_slide_names(pptx)
+        )
+
+
+def get_slide_names(pptx):
+    """Return the archive names of the slides in the order they are shown.
+
+    The number in a slide's file name need not match its position, so the
+    order comes from the presentation part. Slides it does not list, or all
+    slides when it cannot be read, follow in file-number order.
+    """
+    numbered = sorted(
+        (name for name in pptx.namelist() if SLIDE_NAME_RE.match(name)),
+        key=slide_sort_key,
+    )
+    try:
+        listed = get_presentation_order(pptx)
+    except (KeyError, ET.ParseError):
+        return numbered
+    archive_names = set(pptx.namelist())
+    ordered = dict.fromkeys(name for name in listed if name in archive_names)
+    ordered.update(dict.fromkeys(numbered))
+    return list(ordered)
+
+
+def get_presentation_order(pptx):
+    """Return the slide part names listed by the presentation part, in order.
+
+    Raises KeyError when the presentation part or its relationships part is
+    missing, and ET.ParseError when either is not well-formed XML.
+    """
+    relationships = ET.fromstring(pptx.read(PRESENTATION_RELS_PART))
+    targets = {
+        node.get("Id"): node.get("Target", "")
+        for node in relationships.iter()
+        if get_local_name(node.tag) == "Relationship"
+    }
+    presentation = ET.fromstring(pptx.read(PRESENTATION_PART))
+    names = []
+    for node in presentation.iter():
+        if get_local_name(node.tag) != "sldId":
+            continue
+        for key, value in node.attrib.items():
+            # The relationship id is the namespaced "id" attribute; the plain
+            # "id" attribute is the slide's numeric identifier.
+            is_rel_id = key.startswith("{") and get_local_name(key) == "id"
+            if is_rel_id and value in targets:
+                names.append(resolve_slide_target(targets[value]))
+    return names
+
+
+def resolve_slide_target(target):
+    """Return the archive name a presentation relationship target points to.
+
+    Targets are relative to the "ppt" folder unless they start with "/".
+    """
+    return posixpath.normpath(posixpath.join("ppt", target)).lstrip("/")
+
+
+def slide_sort_key(name):
+    """Return the slide number embedded in a slide file name."""
+    return int(re.search(r"slide([0-9]+)\.xml$", name).group(1))
+
+
+def get_xml_text(xml_bytes):
+    """Return the text of an Office XML part, one paragraph after another.
+
+    Every element whose local name is "p" counts as a paragraph; paragraphs
+    without visible text are dropped and the rest are joined by newlines.
+    """
+    root = ET.fromstring(xml_bytes)
+    paragraphs = []
+    for element in root.iter():
+        if get_local_name(element.tag) != "p":
+            continue
+        text = "".join(iter_paragraph_text(element))
+        if text.strip():
+            paragraphs.append(text)
+    return "\n".join(paragraphs)
+
+
+def iter_paragraph_text(paragraph):
+    """Yield the text pieces of one paragraph in document order.
+
+    Text inside a nested paragraph (for example a text box) is left to that
+    paragraph so it is not extracted twice. Break and tab elements become
+    whitespace so the words around them do not run together.
+    """
+    pending = list(paragraph)[::-1]
+    while pending:
+        node = pending.pop()
+        name = get_local_name(node.tag)
+        if name == "t":
+            yield node.text or ""
+        elif name in INLINE_WHITESPACE:
+            yield INLINE_WHITESPACE[name]
+        elif name not in SKIPPED_IN_PARAGRAPH:
+            pending.extend(list(node)[::-1])
+
+
+def get_local_name(tag):
+    """Return an element or attribute name without its "{namespace}" part."""
+    return tag.rsplit("}", 1)[-1]
diff --git a/src/semantra/semantra.py b/src/semantra/semantra.py
index 9636117..4db4b02 100644
--- a/src/semantra/semantra.py
+++ b/src/semantra/semantra.py
@@ -12,6 +12,7 @@
 from tqdm import tqdm
 
 from .models import BaseModel, TransformerModel, as_numpy, models
+from .office import get_office_content
 from .pdf import get_pdf_content
 from .util import (
     HASH_LENGTH,
@@ -48,6 +49,8 @@ def __init__(self, rawtext, filename):
 def get_text_content(md5, filename, semantra_dir, force, silent, encoding):
     if filename.endswith(".pdf"):
         return get_pdf_content(md5, filename, semantra_dir, force, silent)
+    if filename.lower().endswith((".docx", ".pptx")):
+        return get_office_content(filename)
     with open(filename, "r", encoding=encoding, errors="ignore") as f:
         rawtext = f.read()
 