From cb2133d0fdaa66505051acc077f7b17617c2e608 Mon Sep 17 00:00:00 2001 From: Augustin Zidek <augustinzidek@google.com> Date: Tue, 19 Nov 2024 15:12:23 +0000 Subject: [PATCH] Convert Stockholm to A3M without reading the whole file into RAM PiperOrigin-RevId: 698003594 --- src/alphafold3/data/parsers.py | 10 +++-- src/alphafold3/data/tools/hmmsearch.py | 8 ++-- src/alphafold3/data/tools/jackhmmer.py | 7 ++-- src/alphafold3/data/tools/nhmmer.py | 57 +++++++++++++------------- 4 files changed, 40 insertions(+), 42 deletions(-) diff --git a/src/alphafold3/data/parsers.py b/src/alphafold3/data/parsers.py index 183d4ff..8389810 100644 --- a/src/alphafold3/data/parsers.py +++ b/src/alphafold3/data/parsers.py @@ -11,7 +11,7 @@ """Functions for parsing various file formats.""" from collections.abc import Iterable, Sequence -from typing import TypeAlias +from typing import IO, TypeAlias from alphafold3.cpp import fasta_iterator from alphafold3.cpp import msa_conversion @@ -102,7 +102,7 @@ def convert_a3m_to_stockholm(a3m: str, max_seqs: int | None = None) -> str: def convert_stockholm_to_a3m( - stockholm_format: str, + stockholm: IO[str], max_sequences: int | None = None, remove_first_row_gaps: bool = True, linewidth: int | None = None, @@ -115,7 +115,7 @@ def convert_stockholm_to_a3m( if linewidth is not None and linewidth <= 0: raise ValueError('linewidth must be > 0 or None') - for line in stockholm_format.splitlines(): + for line in stockholm: reached_max_sequences = max_sequences and len(sequences) >= max_sequences line = line.strip() # Ignore blank lines, markup and end symbols - remainder are alignment @@ -129,7 +129,9 @@ def convert_stockholm_to_a3m( sequences[seqname] = '' sequences[seqname] += aligned_seq - for line in stockholm_format.splitlines(): + stockholm.seek(0) + for line in stockholm: + line = line.strip() if line[:4] == '#=GS': # Description row - example format is: # #=GS UniRef90_Q9H5Z4/4-78 DE [subseq from] cDNA: FLJ22755 ... diff --git a/src/alphafold3/data/tools/hmmsearch.py b/src/alphafold3/data/tools/hmmsearch.py index adab18d..a9bbde6 100644 --- a/src/alphafold3/data/tools/hmmsearch.py +++ b/src/alphafold3/data/tools/hmmsearch.py @@ -125,11 +125,9 @@ class Hmmsearch(object): ) with open(sto_out_path) as f: - sto_out = f.read() - - a3m_out = parsers.convert_stockholm_to_a3m( - sto_out, remove_first_row_gaps=False, linewidth=60 - ) + a3m_out = parsers.convert_stockholm_to_a3m( + f, remove_first_row_gaps=False, linewidth=60 + ) return a3m_out diff --git a/src/alphafold3/data/tools/jackhmmer.py b/src/alphafold3/data/tools/jackhmmer.py index d1771a3..3603450 100644 --- a/src/alphafold3/data/tools/jackhmmer.py +++ b/src/alphafold3/data/tools/jackhmmer.py @@ -125,10 +125,9 @@ class Jackhmmer(msa_tool.MsaTool): ) with open(output_sto_path) as f: - output_sto_str = f.read() - a3m = parsers.convert_stockholm_to_a3m( - output_sto_str, max_sequences=self.max_sequences - ) + a3m = parsers.convert_stockholm_to_a3m( + f, max_sequences=self.max_sequences + ) return msa_tool.MsaToolResult( target_sequence=target_sequence, a3m=a3m, e_value=self.e_value diff --git a/src/alphafold3/data/tools/nhmmer.py b/src/alphafold3/data/tools/nhmmer.py index e792699..91732ba 100644 --- a/src/alphafold3/data/tools/nhmmer.py +++ b/src/alphafold3/data/tools/nhmmer.py @@ -132,35 +132,34 @@ class Nhmmer(msa_tool.MsaTool): log_on_process_error=True, ) - with open(output_sto_path) as f: - sto_out = f.read() - - if sto_out: - a3m_out = parsers.convert_stockholm_to_a3m( - sto_out, max_sequences=self._max_sequences - 1 # Query not included. - ) - # Nhmmer hits are generally shorter than the query sequence. To get an MSA - # of width equal to the query sequence, align hits to the query profile. - logging.info('Aligning output a3m of size %d bytes', len(a3m_out)) - - aligner = hmmalign.Hmmalign(self._hmmalign_binary_path) - target_sequence_fasta = f'>query\n{target_sequence}\n' - profile_builder = hmmbuild.Hmmbuild( - binary_path=self._hmmbuild_binary_path, alphabet=self._alphabet - ) - profile = profile_builder.build_profile_from_a3m(target_sequence_fasta) - a3m_out = aligner.align_sequences_to_profile( - profile=profile, sequences_a3m=a3m_out - ) - a3m_out = ''.join([target_sequence_fasta, a3m_out]) - - # Parse the output a3m to remove line breaks. - a3m = [f'>{n}\n{s}' for s, n in parsers.lazy_parse_fasta_string(a3m_out)] - a3m = '\n'.join(a3m) - else: - # Nhmmer returns an empty file if there are no hits. - # In this case return only the query sequence. - a3m = f'>query\n{target_sequence}' + if os.path.getsize(output_sto_path) > 0: + with open(output_sto_path) as f: + a3m_out = parsers.convert_stockholm_to_a3m( + f, max_sequences=self._max_sequences - 1 # Query not included. + ) + # Nhmmer hits are generally shorter than the query sequence. To get MSA + # of width equal to the query sequence, align hits to the query profile. + logging.info('Aligning output a3m of size %d bytes', len(a3m_out)) + + aligner = hmmalign.Hmmalign(self._hmmalign_binary_path) + target_sequence_fasta = f'>query\n{target_sequence}\n' + profile_builder = hmmbuild.Hmmbuild( + binary_path=self._hmmbuild_binary_path, alphabet=self._alphabet + ) + profile = profile_builder.build_profile_from_a3m(target_sequence_fasta) + a3m_out = aligner.align_sequences_to_profile( + profile=profile, sequences_a3m=a3m_out + ) + a3m_out = ''.join([target_sequence_fasta, a3m_out]) + + # Parse the output a3m to remove line breaks. + a3m = '\n'.join( + [f'>{n}\n{s}' for s, n in parsers.lazy_parse_fasta_string(a3m_out)] + ) + else: + # Nhmmer returns an empty file if there are no hits. + # In this case return only the query sequence. + a3m = f'>query\n{target_sequence}' return msa_tool.MsaToolResult( target_sequence=target_sequence, e_value=self._e_value, a3m=a3m -- GitLab