Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion machine/corpora/aligned_word_pair.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def format_score(score: float) -> str:
source_index = "NULL" if self.source_index < 0 else str(self.source_index)
target_index = "NULL" if self.target_index < 0 else str(self.target_index)
repr = f"{source_index}-{target_index}"
if include_scores and self.translation_score >= 0:
if include_scores and (self.translation_score >= 0 or self.alignment_score >= 0):
repr += f":{format_score(self.translation_score)}"
if self.alignment_score >= 0:
repr += f":{format_score(self.alignment_score)}"
Expand Down
58 changes: 51 additions & 7 deletions machine/jobs/eflomal_aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ def execute_eflomal(
target_path: Path,
forward_links_path: Path,
reverse_links_path: Path,
forward_scores_output: Path,
reverse_scores_output: Path,
n_iterations: Tuple[int, int, int],
) -> None:
if not is_eflomal_available():
Expand All @@ -49,6 +51,10 @@ def execute_eflomal(
str(forward_links_path),
"-r",
str(reverse_links_path),
"-F",
str(forward_scores_output),
"-R",
str(reverse_scores_output),
# "-q",
"-m",
"3",
Expand Down Expand Up @@ -107,6 +113,30 @@ def normalize_for_alignment(sent: Sequence[str]) -> str:
return " ".join(lowercase(normalize("NFC", escape_spaces(sent))))


def compute_aligned_word_pair_scores(
    forward_matrix: WordAlignmentMatrix,
    forward_sentence_score: float,
    reverse_sentence_score: float,
) -> str:
    """Serialize a matrix's aligned word pairs, all tagged with one sentence-level score.

    The two sentence scores are averaged and squashed with ``1 / (1 + |avg|)``,
    so a finite average of 0 maps to 1.0 and averages further from zero map
    towards 0.0. Every pair gets that same value as its alignment score; the
    translation score is set to -1.

    NOTE(review): the inputs are presumably per-sentence average
    log-probabilities read from the aligner's score files -- confirm with the
    caller.

    Args:
        forward_matrix: Matrix whose ``to_aligned_word_pairs()`` supplies the
            source/target index pairs.
        forward_sentence_score: Sentence score for the forward direction.
        reverse_sentence_score: Sentence score for the reverse direction.

    Returns:
        The ``str()`` of each rebuilt pair, joined by single spaces (an empty
        string when the matrix yields no pairs).
    """
    mean_logp = (forward_sentence_score + reverse_sentence_score) / 2.0
    confidence = 1.0 / (1.0 + abs(mean_logp))

    # Rebuild each pair so it carries the shared sentence-level confidence.
    rescored: List[AlignedWordPair] = [
        AlignedWordPair(
            pair.source_index,
            pair.target_index,
            translation_score=-1,
            alignment_score=confidence,
        )
        for pair in forward_matrix.to_aligned_word_pairs()
    ]

    return " ".join(map(str, rescored))


# From silnlp.alignment.eflomal
class EflomalAligner:
def __init__(self, model_dir: Path) -> None:
Expand Down Expand Up @@ -138,22 +168,30 @@ def train(self, src_toks: Sequence[Sequence[str]], trg_toks: Sequence[Sequence[s
trg_eflomal_path,
self._model_dir / "forward-align.txt",
self._model_dir / "reverse-align.txt",
self._model_dir / "forward-scores.txt",
self._model_dir / "reverse-scores.txt",
n_iterations,
)

def align(self, sym_heuristic: str = "grow-diag-final-and") -> List[str]:
forward_align_path = self._model_dir / "forward-align.txt"
reverse_align_path = self._model_dir / "reverse-align.txt"
forward_scores_path = self._model_dir / "forward-scores.txt"
reverse_scores_path = self._model_dir / "reverse-scores.txt"

alignments = []
heuristic = SymmetrizationHeuristic[sym_heuristic.upper().replace("-", "_")]
with ExitStack() as stack:
forward_file = stack.enter_context(forward_align_path.open("r", encoding="utf-8-sig"))
reverse_file = stack.enter_context(reverse_align_path.open("r", encoding="utf-8-sig"))

for forward_line, reverse_line in zip(forward_file, reverse_file):
forward_matrix = to_word_alignment_matrix(forward_line.strip())
reverse_matrix = to_word_alignment_matrix(reverse_line.strip())
forward_align_file = stack.enter_context(forward_align_path.open("r", encoding="utf-8-sig"))
reverse_align_file = stack.enter_context(reverse_align_path.open("r", encoding="utf-8-sig"))
forward_scores_file = stack.enter_context(forward_scores_path.open("r", encoding="utf-8-sig"))
reverse_scores_file = stack.enter_context(reverse_scores_path.open("r", encoding="utf-8-sig"))

for forward_align_line, reverse_align_line, forward_sentence_score, reverse_sentence_score in zip(
forward_align_file, reverse_align_file, forward_scores_file, reverse_scores_file
):
forward_matrix = to_word_alignment_matrix(str(forward_align_line.strip()))
reverse_matrix = to_word_alignment_matrix(str(reverse_align_line.strip()))
src_len = max(forward_matrix.row_count, reverse_matrix.row_count)
trg_len = max(forward_matrix.column_count, reverse_matrix.column_count)

Expand All @@ -162,6 +200,12 @@ def align(self, sym_heuristic: str = "grow-diag-final-and") -> List[str]:

forward_matrix.symmetrize_with(reverse_matrix, heuristic)

alignments.append(str(forward_matrix))
scored_word_pairs = compute_aligned_word_pair_scores(
forward_matrix,
float(forward_sentence_score.strip()),
float(reverse_sentence_score.strip()),
)

alignments.append(scored_word_pairs)

return alignments
Loading