Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# Copyright 2024 DeepMind Technologies Limited
#
# AlphaFold 3 source code is licensed under CC BY-NC-SA 4.0. To view a copy of
# this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/
#
# To request access to the AlphaFold 3 model parameters, follow the process set
# out at https://github.com/google-deepmind/alphafold3. You may only use these
# if received directly from Google. Use is subject to terms of use available at
# https://github.com/google-deepmind/alphafold3/blob/main/WEIGHTS_TERMS_OF_USE.md
"""Library to run Nhmmer from Python."""
import os
import pathlib
import tempfile
from typing import Final
from absl import logging
from alphafold3.data import parsers
from alphafold3.data.tools import hmmalign
from alphafold3.data.tools import hmmbuild
from alphafold3.data.tools import msa_tool
from alphafold3.data.tools import subprocess_utils
_SHORT_SEQUENCE_CUTOFF: Final[int] = 50
class Nhmmer(msa_tool.MsaTool):
"""Python wrapper of the Nhmmer binary."""
def __init__(
self,
binary_path: str,
hmmalign_binary_path: str,
hmmbuild_binary_path: str,
database_path: str,
n_cpu: int = 8,
e_value: float = 1e-3,
max_sequences: int = 5000,
filter_f3: float = 1e-5,
alphabet: str | None = None,
strand: str | None = None,
):
"""Initializes the Python Nhmmer wrapper.
Args:
binary_path: Path to the Nhmmer binary.
hmmalign_binary_path: Path to the Hmmalign binary.
hmmbuild_binary_path: Path to the Hmmbuild binary.
database_path: MSA database path to search against. This can be either a
FASTA (slow) or HMMERDB produced from the FASTA using the makehmmerdb
binary. The HMMERDB is ~10x faster but experimental.
n_cpu: The number of CPUs to give Nhmmer.
e_value: The E-value, see Nhmmer docs for more details. Will be
overwritten if bit_score is set.
max_sequences: Maximum number of sequences to return in the MSA.
filter_f3: Forward pre-filter, set to >1.0 to turn off.
alphabet: The alphabet to assert when building a profile with hmmbuild.
This must be 'rna', 'dna', or None.
strand: "watson" searches query sequence, "crick" searches
reverse-compliment and default is None which means searching for both.
Raises:
RuntimeError: If Nhmmer binary not found within the path.
"""
self._binary_path = binary_path
self._hmmalign_binary_path = hmmalign_binary_path
self._hmmbuild_binary_path = hmmbuild_binary_path
self._db_path = database_path
subprocess_utils.check_binary_exists(path=self._binary_path, name='Nhmmer')
if strand and strand not in {'watson', 'crick'}:
raise ValueError(f'Invalid {strand=}. only "watson" or "crick" supported')
if alphabet and alphabet not in {'rna', 'dna'}:
raise ValueError(f'Invalid {alphabet=}, only "rna" or "dna" supported')
self._e_value = e_value
self._n_cpu = n_cpu
self._max_sequences = max_sequences
self._filter_f3 = filter_f3
self._alphabet = alphabet
self._strand = strand
def query(self, target_sequence: str) -> msa_tool.MsaToolResult:
"""Query the database using Nhmmer."""
logging.info('Query sequence: %s', target_sequence)
with tempfile.TemporaryDirectory() as query_tmp_dir:
input_a3m_path = os.path.join(query_tmp_dir, 'query.a3m')
output_sto_path = os.path.join(query_tmp_dir, 'output.sto')
pathlib.Path(output_sto_path).touch()
subprocess_utils.create_query_fasta_file(
sequence=target_sequence, path=input_a3m_path
)
cmd_flags = [
*('-o', '/dev/null'), # Don't pollute stdout with nhmmer output.
'--noali', # Don't include the alignment in stdout.
*('--cpu', str(self._n_cpu)),
]
cmd_flags.extend(['-E', str(self._e_value)])
if self._alphabet:
cmd_flags.extend([f'--{self._alphabet}'])
if self._strand is not None:
cmd_flags.extend([f'--{self._strand}'])
cmd_flags.extend(['-A', output_sto_path])
# As recommend by RNAcentral for short sequences.
if (
self._alphabet == 'rna'
and len(target_sequence) < _SHORT_SEQUENCE_CUTOFF
):
cmd_flags.extend(['--F3', str(0.02)])
else:
cmd_flags.extend(['--F3', str(self._filter_f3)])
# The input A3M and the db are the last two arguments.
cmd_flags.extend((input_a3m_path, self._db_path))
cmd = [self._binary_path, *cmd_flags]
subprocess_utils.run(
cmd=cmd,
cmd_name='Nhmmer',
log_stdout=False,
log_stderr=True,
log_on_process_error=True,
)
with open(output_sto_path) as f:
sto_out = f.read()
if sto_out:
a3m_out = parsers.convert_stockholm_to_a3m(
sto_out, max_sequences=self._max_sequences - 1 # Query not included.
)
# Nhmmer hits are generally shorter than the query sequence. To get an MSA
# of width equal to the query sequence, align hits to the query profile.
logging.info('Aligning output a3m of size %d bytes', len(a3m_out))
aligner = hmmalign.Hmmalign(self._hmmalign_binary_path)
target_sequence_fasta = f'>query\n{target_sequence}\n'
profile_builder = hmmbuild.Hmmbuild(
binary_path=self._hmmbuild_binary_path, alphabet=self._alphabet
)
profile = profile_builder.build_profile_from_a3m(target_sequence_fasta)
a3m_out = aligner.align_sequences_to_profile(
profile=profile, sequences_a3m=a3m_out
)
a3m_out = ''.join([target_sequence_fasta, a3m_out])
# Parse the output a3m to remove line breaks.
a3m = [f'>{n}\n{s}' for s, n in parsers.lazy_parse_fasta_string(a3m_out)]
a3m = '\n'.join(a3m)
else:
# Nhmmer returns an empty file if there are no hits.
# In this case return only the query sequence.
a3m = f'>query\n{target_sequence}'
return msa_tool.MsaToolResult(
target_sequence=target_sequence, e_value=self._e_value, a3m=a3m