From cb2133d0fdaa66505051acc077f7b17617c2e608 Mon Sep 17 00:00:00 2001
From: Augustin Zidek <augustinzidek@google.com>
Date: Tue, 19 Nov 2024 15:12:23 +0000
Subject: [PATCH] Convert Stockholm to A3M without reading the whole file into
 RAM

PiperOrigin-RevId: 698003594
---
 src/alphafold3/data/parsers.py         | 10 +++--
 src/alphafold3/data/tools/hmmsearch.py |  8 ++--
 src/alphafold3/data/tools/jackhmmer.py |  7 ++--
 src/alphafold3/data/tools/nhmmer.py    | 57 +++++++++++++-------------
 4 files changed, 40 insertions(+), 42 deletions(-)

diff --git a/src/alphafold3/data/parsers.py b/src/alphafold3/data/parsers.py
index 183d4ff..8389810 100644
--- a/src/alphafold3/data/parsers.py
+++ b/src/alphafold3/data/parsers.py
@@ -11,7 +11,7 @@
 """Functions for parsing various file formats."""
 
 from collections.abc import Iterable, Sequence
-from typing import TypeAlias
+from typing import IO, TypeAlias
 
 from alphafold3.cpp import fasta_iterator
 from alphafold3.cpp import msa_conversion
@@ -102,7 +102,7 @@ def convert_a3m_to_stockholm(a3m: str, max_seqs: int | None = None) -> str:
 
 
 def convert_stockholm_to_a3m(
-    stockholm_format: str,
+    stockholm: IO[str],
     max_sequences: int | None = None,
     remove_first_row_gaps: bool = True,
     linewidth: int | None = None,
@@ -115,7 +115,7 @@ def convert_stockholm_to_a3m(
   if linewidth is not None and linewidth <= 0:
     raise ValueError('linewidth must be > 0 or None')
 
-  for line in stockholm_format.splitlines():
+  for line in stockholm:
     reached_max_sequences = max_sequences and len(sequences) >= max_sequences
     line = line.strip()
     # Ignore blank lines, markup and end symbols - remainder are alignment
@@ -129,7 +129,9 @@ def convert_stockholm_to_a3m(
       sequences[seqname] = ''
     sequences[seqname] += aligned_seq
 
-  for line in stockholm_format.splitlines():
+  stockholm.seek(0)
+  for line in stockholm:
+    line = line.strip()
     if line[:4] == '#=GS':
       # Description row - example format is:
       # #=GS UniRef90_Q9H5Z4/4-78            DE [subseq from] cDNA: FLJ22755 ...
diff --git a/src/alphafold3/data/tools/hmmsearch.py b/src/alphafold3/data/tools/hmmsearch.py
index adab18d..a9bbde6 100644
--- a/src/alphafold3/data/tools/hmmsearch.py
+++ b/src/alphafold3/data/tools/hmmsearch.py
@@ -125,11 +125,9 @@ class Hmmsearch(object):
       )
 
       with open(sto_out_path) as f:
-        sto_out = f.read()
-
-      a3m_out = parsers.convert_stockholm_to_a3m(
-          sto_out, remove_first_row_gaps=False, linewidth=60
-      )
+        a3m_out = parsers.convert_stockholm_to_a3m(
+            f, remove_first_row_gaps=False, linewidth=60
+        )
 
     return a3m_out
 
diff --git a/src/alphafold3/data/tools/jackhmmer.py b/src/alphafold3/data/tools/jackhmmer.py
index d1771a3..3603450 100644
--- a/src/alphafold3/data/tools/jackhmmer.py
+++ b/src/alphafold3/data/tools/jackhmmer.py
@@ -125,10 +125,9 @@ class Jackhmmer(msa_tool.MsaTool):
       )
 
       with open(output_sto_path) as f:
-        output_sto_str = f.read()
-      a3m = parsers.convert_stockholm_to_a3m(
-          output_sto_str, max_sequences=self.max_sequences
-      )
+        a3m = parsers.convert_stockholm_to_a3m(
+            f, max_sequences=self.max_sequences
+        )
 
     return msa_tool.MsaToolResult(
         target_sequence=target_sequence, a3m=a3m, e_value=self.e_value
diff --git a/src/alphafold3/data/tools/nhmmer.py b/src/alphafold3/data/tools/nhmmer.py
index e792699..91732ba 100644
--- a/src/alphafold3/data/tools/nhmmer.py
+++ b/src/alphafold3/data/tools/nhmmer.py
@@ -132,35 +132,34 @@ class Nhmmer(msa_tool.MsaTool):
           log_on_process_error=True,
       )
 
-      with open(output_sto_path) as f:
-        sto_out = f.read()
-
-    if sto_out:
-      a3m_out = parsers.convert_stockholm_to_a3m(
-          sto_out, max_sequences=self._max_sequences - 1  # Query not included.
-      )
-      # Nhmmer hits are generally shorter than the query sequence. To get an MSA
-      # of width equal to the query sequence, align hits to the query profile.
-      logging.info('Aligning output a3m of size %d bytes', len(a3m_out))
-
-      aligner = hmmalign.Hmmalign(self._hmmalign_binary_path)
-      target_sequence_fasta = f'>query\n{target_sequence}\n'
-      profile_builder = hmmbuild.Hmmbuild(
-          binary_path=self._hmmbuild_binary_path, alphabet=self._alphabet
-      )
-      profile = profile_builder.build_profile_from_a3m(target_sequence_fasta)
-      a3m_out = aligner.align_sequences_to_profile(
-          profile=profile, sequences_a3m=a3m_out
-      )
-      a3m_out = ''.join([target_sequence_fasta, a3m_out])
-
-      # Parse the output a3m to remove line breaks.
-      a3m = [f'>{n}\n{s}' for s, n in parsers.lazy_parse_fasta_string(a3m_out)]
-      a3m = '\n'.join(a3m)
-    else:
-      # Nhmmer returns an empty file if there are no hits.
-      # In this case return only the query sequence.
-      a3m = f'>query\n{target_sequence}'
+      if os.path.getsize(output_sto_path) > 0:
+        with open(output_sto_path) as f:
+          a3m_out = parsers.convert_stockholm_to_a3m(
+              f, max_sequences=self._max_sequences - 1  # Query not included.
+          )
+        # Nhmmer hits are generally shorter than the query sequence. To get MSA
+        # of width equal to the query sequence, align hits to the query profile.
+        logging.info('Aligning output a3m of size %d bytes', len(a3m_out))
+
+        aligner = hmmalign.Hmmalign(self._hmmalign_binary_path)
+        target_sequence_fasta = f'>query\n{target_sequence}\n'
+        profile_builder = hmmbuild.Hmmbuild(
+            binary_path=self._hmmbuild_binary_path, alphabet=self._alphabet
+        )
+        profile = profile_builder.build_profile_from_a3m(target_sequence_fasta)
+        a3m_out = aligner.align_sequences_to_profile(
+            profile=profile, sequences_a3m=a3m_out
+        )
+        a3m_out = ''.join([target_sequence_fasta, a3m_out])
+
+        # Parse the output a3m to remove line breaks.
+        a3m = '\n'.join(
+            [f'>{n}\n{s}' for s, n in parsers.lazy_parse_fasta_string(a3m_out)]
+        )
+      else:
+        # Nhmmer returns an empty file if there are no hits.
+        # In this case return only the query sequence.
+        a3m = f'>query\n{target_sequence}'
 
     return msa_tool.MsaToolResult(
         target_sequence=target_sequence, e_value=self._e_value, a3m=a3m
-- 
GitLab