Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ pip install semantra
## Usage

Semantra operates on collections of documents — text or PDF files — stored on your local computer.
Word `.docx` and PowerPoint `.pptx` files can also be indexed as extracted text.

At its simplest, you can run Semantra over a single document by running:

Expand Down
60 changes: 60 additions & 0 deletions src/semantra/office.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import re
import zipfile
import xml.etree.ElementTree as ET


class OfficeContent:
    """Text extracted from an Office document, plus the path it came from.

    Attributes are plain instance fields:
      rawtext  -- the extracted text passed in by the caller
      filename -- the path passed in by the caller
      filetype -- always the string "text"
    """

    def __init__(self, rawtext, filename):
        """Store the extracted text and source path; tag the content as text."""
        self.rawtext, self.filename = rawtext, filename
        # Office files are handed on as extracted plain text, hence "text".
        self.filetype = "text"


def get_office_content(filename):
    """Extract the text of a Word or PowerPoint file.

    The extractor is chosen from the file extension, compared
    case-insensitively so that names such as ``REPORT.DOCX`` are handled
    the same way as ``report.docx``.

    Args:
        filename: Path of the ``.docx`` or ``.pptx`` file, as a string.

    Returns:
        An ``OfficeContent`` holding the extracted text and ``filename``
        exactly as it was passed in.

    Raises:
        ValueError: If the extension is neither ``.docx`` nor ``.pptx``.
    """
    # Only the comparison uses the lowered name; the original spelling is
    # what gets opened and what is reported back to the caller.
    lowered = filename.lower()
    if lowered.endswith(".docx"):
        rawtext = get_docx_text(filename)
    elif lowered.endswith(".pptx"):
        rawtext = get_pptx_text(filename)
    else:
        raise ValueError(f"Unsupported Office file: {filename}")
    return OfficeContent(rawtext, filename)


def get_docx_text(filename):
    """Return the text of the ``word/document.xml`` part of a .docx archive.

    ``filename`` is handed straight to ``zipfile.ZipFile``. A ``KeyError``
    from the archive lookup propagates if the part is missing.
    """
    with zipfile.ZipFile(filename) as archive:
        document_xml = archive.read("word/document.xml")
        extracted = get_xml_text(document_xml)
    return extracted


def get_pptx_text(filename):
with zipfile.ZipFile(filename) as pptx:
slide_names = [
name
for name in pptx.namelist()
if re.match(r"ppt/slides/slide[0-9]+\.xml$", name)
]
slide_names.sort(key=slide_sort_key)
return "\n\n".join(get_xml_text(pptx.read(name)) for name in slide_names)


def slide_sort_key(name):
    """Return the number in a ``slide<N>.xml`` part name as an int.

    Used as a sort key so that ``slide10.xml`` sorts after ``slide2.xml``.
    """
    found = re.search(r"slide([0-9]+)\.xml$", name)
    digits = found.group(1)
    return int(digits)


def get_xml_text(xml_bytes):
    """Extract paragraph text from one Office Open XML part.

    Every element whose local name is ``p`` is treated as a paragraph,
    whatever its namespace. Each paragraph contributes one entry built from
    its own ``t`` (text) elements; paragraphs whose text is empty or only
    whitespace are dropped.

    Args:
        xml_bytes: The raw XML of the part (for example the contents of
            ``word/document.xml`` or of a slide part).

    Returns:
        The paragraph texts joined with ``"\\n"``, or ``""`` when the part
        holds no text.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_bytes`` is not
            well-formed XML.
    """
    root = ET.fromstring(xml_bytes)
    paragraphs = []
    # root.iter() also visits paragraphs nested inside other paragraphs
    # (e.g. text boxes), so each of them still gets its own entry here.
    for paragraph in root.iter():
        if get_local_name(paragraph.tag) != "p":
            continue
        text = _paragraph_text(paragraph)
        if text.strip():
            paragraphs.append(text)
    return "\n".join(paragraphs)


def _paragraph_text(paragraph):
    """Return the text that belongs directly to ``paragraph``."""
    pieces = []
    # Explicit stack, children pushed in reverse, so that elements are
    # visited in document order without recursion.
    pending = list(paragraph)[::-1]
    while pending:
        node = pending.pop()
        local = get_local_name(node.tag)
        if local == "p":
            # A nested paragraph is emitted as its own entry by the caller;
            # descending here would duplicate its text.
            continue
        if local in ("tabs", "tabLst"):
            # Tab-stop definitions in paragraph properties, not tab characters.
            continue
        if local == "t":
            pieces.append(node.text or "")
        elif local in ("br", "cr"):
            # Line breaks separate words; dropping them would glue text together.
            pieces.append("\n")
        elif local == "tab":
            pieces.append("\t")
        else:
            pending.extend(list(node)[::-1])
    return "".join(pieces)


def get_local_name(tag):
    """Return ``tag`` without its ``{namespace}`` prefix, if it has one."""
    return tag.rsplit("}", 1)[-1]
3 changes: 3 additions & 0 deletions src/semantra/semantra.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from tqdm import tqdm

from .models import BaseModel, TransformerModel, as_numpy, models
from .office import get_office_content
from .pdf import get_pdf_content
from .util import (
HASH_LENGTH,
Expand Down Expand Up @@ -48,6 +49,8 @@ def __init__(self, rawtext, filename):
def get_text_content(md5, filename, semantra_dir, force, silent, encoding):
if filename.endswith(".pdf"):
return get_pdf_content(md5, filename, semantra_dir, force, silent)
if filename.endswith((".docx", ".pptx")):
return get_office_content(filename)

with open(filename, "r", encoding=encoding, errors="ignore") as f:
rawtext = f.read()
Expand Down