diff --git a/src/alphafold3/data/parsers.py b/src/alphafold3/data/parsers.py index 183d4ff43dbb4ff6894251f4a06709a5289922aa..8389810c44f8b36375ecd51eced86240dde7cd43 100644 --- a/src/alphafold3/data/parsers.py +++ b/src/alphafold3/data/parsers.py @@ -11,7 +11,7 @@ """Functions for parsing various file formats.""" from collections.abc import Iterable, Sequence -from typing import TypeAlias +from typing import IO, TypeAlias from alphafold3.cpp import fasta_iterator from alphafold3.cpp import msa_conversion @@ -102,7 +102,7 @@ def convert_a3m_to_stockholm(a3m: str, max_seqs: int | None = None) -> str: def convert_stockholm_to_a3m( - stockholm_format: str, + stockholm: IO[str], max_sequences: int | None = None, remove_first_row_gaps: bool = True, linewidth: int | None = None, @@ -115,7 +115,7 @@ def convert_stockholm_to_a3m( if linewidth is not None and linewidth <= 0: raise ValueError('linewidth must be > 0 or None') - for line in stockholm_format.splitlines(): + for line in stockholm: reached_max_sequences = max_sequences and len(sequences) >= max_sequences line = line.strip() # Ignore blank lines, markup and end symbols - remainder are alignment @@ -129,7 +129,9 @@ def convert_stockholm_to_a3m( sequences[seqname] = '' sequences[seqname] += aligned_seq - for line in stockholm_format.splitlines(): + stockholm.seek(0) + for line in stockholm: + line = line.strip() if line[:4] == '#=GS': # Description row - example format is: # #=GS UniRef90_Q9H5Z4/4-78 DE [subseq from] cDNA: FLJ22755 ... diff --git a/src/alphafold3/data/tools/hmmsearch.py b/src/alphafold3/data/tools/hmmsearch.py index adab18d29625cbc1b82bde47e5c2fd07657dfcf6..a9bbde6885e2c5e9f5b1e7c5ff42f356228b45a2 100644 --- a/src/alphafold3/data/tools/hmmsearch.py +++ b/src/alphafold3/data/tools/hmmsearch.py @@ -125,11 +125,9 @@ class Hmmsearch(object): ) with open(sto_out_path) as f: - sto_out = f.read() - - a3m_out = parsers.convert_stockholm_to_a3m( - sto_out, remove_first_row_gaps=False, linewidth=60 - ) + a3m_out = parsers.convert_stockholm_to_a3m( + f, remove_first_row_gaps=False, linewidth=60 + ) return a3m_out diff --git a/src/alphafold3/data/tools/jackhmmer.py b/src/alphafold3/data/tools/jackhmmer.py index d1771a3f2776a2227109878dbacea5a1b64fa1cd..3603450ca134d5d4e3a5fd0f0900ab3a34faa4e1 100644 --- a/src/alphafold3/data/tools/jackhmmer.py +++ b/src/alphafold3/data/tools/jackhmmer.py @@ -125,10 +125,9 @@ class Jackhmmer(msa_tool.MsaTool): ) with open(output_sto_path) as f: - output_sto_str = f.read() - a3m = parsers.convert_stockholm_to_a3m( - output_sto_str, max_sequences=self.max_sequences - ) + a3m = parsers.convert_stockholm_to_a3m( + f, max_sequences=self.max_sequences + ) return msa_tool.MsaToolResult( target_sequence=target_sequence, a3m=a3m, e_value=self.e_value diff --git a/src/alphafold3/data/tools/nhmmer.py b/src/alphafold3/data/tools/nhmmer.py index e792699d908b3e53c251f45f153d8a96e4253e00..91732babc2620ffc6e88ba861c2b528a25a0b46a 100644 --- a/src/alphafold3/data/tools/nhmmer.py +++ b/src/alphafold3/data/tools/nhmmer.py @@ -132,35 +132,34 @@ class Nhmmer(msa_tool.MsaTool): log_on_process_error=True, ) - with open(output_sto_path) as f: - sto_out = f.read() - - if sto_out: - a3m_out = parsers.convert_stockholm_to_a3m( - sto_out, max_sequences=self._max_sequences - 1 # Query not included. - ) - # Nhmmer hits are generally shorter than the query sequence. To get an MSA - # of width equal to the query sequence, align hits to the query profile. - logging.info('Aligning output a3m of size %d bytes', len(a3m_out)) - - aligner = hmmalign.Hmmalign(self._hmmalign_binary_path) - target_sequence_fasta = f'>query\n{target_sequence}\n' - profile_builder = hmmbuild.Hmmbuild( - binary_path=self._hmmbuild_binary_path, alphabet=self._alphabet - ) - profile = profile_builder.build_profile_from_a3m(target_sequence_fasta) - a3m_out = aligner.align_sequences_to_profile( - profile=profile, sequences_a3m=a3m_out - ) - a3m_out = ''.join([target_sequence_fasta, a3m_out]) - - # Parse the output a3m to remove line breaks. - a3m = [f'>{n}\n{s}' for s, n in parsers.lazy_parse_fasta_string(a3m_out)] - a3m = '\n'.join(a3m) - else: - # Nhmmer returns an empty file if there are no hits. - # In this case return only the query sequence. - a3m = f'>query\n{target_sequence}' + if os.path.getsize(output_sto_path) > 0: + with open(output_sto_path) as f: + a3m_out = parsers.convert_stockholm_to_a3m( + f, max_sequences=self._max_sequences - 1 # Query not included. + ) + # Nhmmer hits are generally shorter than the query sequence. To get MSA + # of width equal to the query sequence, align hits to the query profile. + logging.info('Aligning output a3m of size %d bytes', len(a3m_out)) + + aligner = hmmalign.Hmmalign(self._hmmalign_binary_path) + target_sequence_fasta = f'>query\n{target_sequence}\n' + profile_builder = hmmbuild.Hmmbuild( + binary_path=self._hmmbuild_binary_path, alphabet=self._alphabet + ) + profile = profile_builder.build_profile_from_a3m(target_sequence_fasta) + a3m_out = aligner.align_sequences_to_profile( + profile=profile, sequences_a3m=a3m_out + ) + a3m_out = ''.join([target_sequence_fasta, a3m_out]) + + # Parse the output a3m to remove line breaks. + a3m = '\n'.join( + [f'>{n}\n{s}' for s, n in parsers.lazy_parse_fasta_string(a3m_out)] + ) + else: + # Nhmmer returns an empty file if there are no hits. + # In this case return only the query sequence. + a3m = f'>query\n{target_sequence}' return msa_tool.MsaToolResult( target_sequence=target_sequence, e_value=self._e_value, a3m=a3m