From e60f750b6289129f5aed5e57a40ce9d3651b024e Mon Sep 17 00:00:00 2001
From: Augustin Zidek <augustinzidek@google.com>
Date: Tue, 19 Nov 2024 13:39:36 +0000
Subject: [PATCH] Don't read more Stockholm sequences than necessary when converting to a3m

PiperOrigin-RevId: 697979915
---
 src/alphafold3/data/tools/jackhmmer.py | 15 +++------------
 src/alphafold3/data/tools/nhmmer.py    | 14 +++++---------
 2 files changed, 8 insertions(+), 21 deletions(-)

diff --git a/src/alphafold3/data/tools/jackhmmer.py b/src/alphafold3/data/tools/jackhmmer.py
index 2b0fd36..d1771a3 100644
--- a/src/alphafold3/data/tools/jackhmmer.py
+++ b/src/alphafold3/data/tools/jackhmmer.py
@@ -124,20 +124,11 @@ class Jackhmmer(msa_tool.MsaTool):
           log_on_process_error=True,
       )
 
-      # Parse sequences (to remove line breaks).
       with open(output_sto_path) as f:
         output_sto_str = f.read()
-      output_a3m_str = parsers.convert_stockholm_to_a3m(output_sto_str)
-      a3m = []
-      for i, (seq, name) in enumerate(
-          parsers.lazy_parse_fasta_string(output_a3m_str)
-      ):
-        if i == self.max_sequences:
-          # Apply the maximum MSA depth limit.
-          logging.info('Limiting MSA depth to %d', self.max_sequences)
-          break
-        a3m.append(f'>{name}\n{seq}')
-      a3m = '\n'.join(a3m)
+      a3m = parsers.convert_stockholm_to_a3m(
+          output_sto_str, max_sequences=self.max_sequences
+      )
 
     return msa_tool.MsaToolResult(
         target_sequence=target_sequence, a3m=a3m, e_value=self.e_value
diff --git a/src/alphafold3/data/tools/nhmmer.py b/src/alphafold3/data/tools/nhmmer.py
index a4b4089..e792699 100644
--- a/src/alphafold3/data/tools/nhmmer.py
+++ b/src/alphafold3/data/tools/nhmmer.py
@@ -136,7 +136,9 @@ class Nhmmer(msa_tool.MsaTool):
         sto_out = f.read()
 
     if sto_out:
-      a3m_out = parsers.convert_stockholm_to_a3m(sto_out)
+      a3m_out = parsers.convert_stockholm_to_a3m(
+          sto_out, max_sequences=self._max_sequences - 1  # Query not included.
+      )
       # Nhmmer hits are generally shorter than the query sequence. To get an MSA
       # of width equal to the query sequence, align hits to the query profile.
       logging.info('Aligning output a3m of size %d bytes', len(a3m_out))
@@ -152,14 +154,8 @@ class Nhmmer(msa_tool.MsaTool):
       )
       a3m_out = ''.join([target_sequence_fasta, a3m_out])
 
-      # Parse sequences (to remove line breaks).
-      a3m = []
-      for i, (seq, name) in enumerate(parsers.lazy_parse_fasta_string(a3m_out)):
-        if i == self._max_sequences:
-          # Apply the maximum MSA depth limit.
-          logging.info('Limiting MSA depth to %d', self._max_sequences)
-          break
-        a3m.append(f'>{name}\n{seq}')
+      # Parse the output a3m to remove line breaks.
+      a3m = [f'>{n}\n{s}' for s, n in parsers.lazy_parse_fasta_string(a3m_out)]
       a3m = '\n'.join(a3m)
     else:
       # Nhmmer returns an empty file if there are no hits.
-- 
GitLab