Skip to content
Snippets Groups Projects
Commit e60f750b authored by Augustin Zidek
Browse files

Don't read more Stockholm sequences than necessary when converting to a3m

PiperOrigin-RevId: 697979915
parent 35996125
No related branches found
No related tags found
1 merge request: !1 "Cloned AlphaFold 3 repo into this one"
...@@ -124,20 +124,11 @@ class Jackhmmer(msa_tool.MsaTool): ...@@ -124,20 +124,11 @@ class Jackhmmer(msa_tool.MsaTool):
log_on_process_error=True, log_on_process_error=True,
) )
# Parse sequences (to remove line breaks).
with open(output_sto_path) as f: with open(output_sto_path) as f:
output_sto_str = f.read() output_sto_str = f.read()
output_a3m_str = parsers.convert_stockholm_to_a3m(output_sto_str) a3m = parsers.convert_stockholm_to_a3m(
a3m = [] output_sto_str, max_sequences=self.max_sequences
for i, (seq, name) in enumerate( )
parsers.lazy_parse_fasta_string(output_a3m_str)
):
if i == self.max_sequences:
# Apply the maximum MSA depth limit.
logging.info('Limiting MSA depth to %d', self.max_sequences)
break
a3m.append(f'>{name}\n{seq}')
a3m = '\n'.join(a3m)
return msa_tool.MsaToolResult( return msa_tool.MsaToolResult(
target_sequence=target_sequence, a3m=a3m, e_value=self.e_value target_sequence=target_sequence, a3m=a3m, e_value=self.e_value
......
...@@ -136,7 +136,9 @@ class Nhmmer(msa_tool.MsaTool): ...@@ -136,7 +136,9 @@ class Nhmmer(msa_tool.MsaTool):
sto_out = f.read() sto_out = f.read()
if sto_out: if sto_out:
a3m_out = parsers.convert_stockholm_to_a3m(sto_out) a3m_out = parsers.convert_stockholm_to_a3m(
sto_out, max_sequences=self._max_sequences - 1 # Query not included.
)
# Nhmmer hits are generally shorter than the query sequence. To get an MSA # Nhmmer hits are generally shorter than the query sequence. To get an MSA
# of width equal to the query sequence, align hits to the query profile. # of width equal to the query sequence, align hits to the query profile.
logging.info('Aligning output a3m of size %d bytes', len(a3m_out)) logging.info('Aligning output a3m of size %d bytes', len(a3m_out))
...@@ -152,14 +154,8 @@ class Nhmmer(msa_tool.MsaTool): ...@@ -152,14 +154,8 @@ class Nhmmer(msa_tool.MsaTool):
) )
a3m_out = ''.join([target_sequence_fasta, a3m_out]) a3m_out = ''.join([target_sequence_fasta, a3m_out])
# Parse sequences (to remove line breaks). # Parse the output a3m to remove line breaks.
a3m = [] a3m = [f'>{n}\n{s}' for s, n in parsers.lazy_parse_fasta_string(a3m_out)]
for i, (seq, name) in enumerate(parsers.lazy_parse_fasta_string(a3m_out)):
if i == self._max_sequences:
# Apply the maximum MSA depth limit.
logging.info('Limiting MSA depth to %d', self._max_sequences)
break
a3m.append(f'>{name}\n{seq}')
a3m = '\n'.join(a3m) a3m = '\n'.join(a3m)
else: else:
# Nhmmer returns an empty file if there are no hits. # Nhmmer returns an empty file if there are no hits.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment