# Source code for geckopy.experimental.molecular_weights
# Copyright 2021 Ginkgo Bioworks
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build dataframe with protein reactions identifiers, Uniprot IDs and MW."""
import re
import urllib.parse
import urllib.request
from typing import Dict, List, Optional

import pandas as pd

from geckopy.model import Model
# Names exported by `from ... import *`.
__all__ = ["get_uniprot", "parse_mw", "extract_proteins"]
# Form fields that `get_uniprot` url-encodes and sends; "query" is a
# placeholder that `get_uniprot` sets for each request.
DEFAULT_PARAMS = {"from": "ACC+ID", "to": "ACC", "format": "txt", "query": ""}
# Endpoint that `get_uniprot` sends the form to.
# NOTE(review): this looks like the legacy UniProt ID-mapping endpoint; verify
# that it is still served before relying on it.
URL = "https://www.uniprot.org/uploadlists/"
# Captures the run of digits right before " MW;" on a line that starts with
# "SQ SEQUENCE" (used by `parse_mw`).
# NOTE(review): UniProt flat-file SQ lines are usually padded with several
# spaces; confirm the whitespace in this pattern against a real response.
pat_mw = re.compile(r"\nSQ SEQUENCE.+ (\d+) MW;")
# Optional "prot_" prefix followed by one captured identifier; the alternation
# presumably mirrors the UniProt accession number format -- TODO confirm.
UNIPROT_PATTERN = re.compile(
r"(?:prot_)?([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})"
)
def get_uniprot(query: str) -> str:
    """Get uniprot information corresponding to a query.

    The query is merged with the module-level ``DEFAULT_PARAMS``, url-encoded
    and sent to the module-level ``URL``; the response body is returned as
    text.

    Parameters
    ----------
    query: str
        an UNIPROT ID(s), separated by spaces

    Returns
    -------
    str
        The response body decoded as UTF-8.

    Raises
    ------
    urllib.error.URLError
        If the request cannot be completed (raised by
        ``urllib.request.urlopen``).
    """
    # Build a per-call copy: assigning into DEFAULT_PARAMS directly would leak
    # the last query into the shared module-level dict.
    params = {**DEFAULT_PARAMS, "query": query}
    data = urllib.parse.urlencode(params).encode("utf-8")
    # Passing a body to Request makes urllib issue a POST instead of a GET.
    req = urllib.request.Request(URL, data)
    with urllib.request.urlopen(req) as f:
        response = f.read()
    return response.decode("utf-8")
def parse_mw(uniprot_info: str) -> List[str]:
    """Get all MW of uniprot text (Dalton).

    Parameters
    ----------
    uniprot_info: str
        text to scan, presumably the output of `get_uniprot`

    Returns
    -------
    list of str
        Every value captured by the module-level ``pat_mw`` pattern, in order
        of appearance. The values are digit strings (they are not converted
        to numbers); the list is empty when nothing matches.
    """
    return pat_mw.findall(uniprot_info)
def _get_all_proteins(model: Model, key_fn) -> Dict:
    """Map each protein of `model` from a derived key to its identifier.

    Parameters
    ----------
    model: Model
        model whose `proteins` are iterated
    key_fn: callable
        called with each protein; its return value is used as dictionary key

    Returns
    -------
    dict
        ``key_fn(protein)`` -> ``protein.id``. If several proteins produce
        the same key, the one iterated last is kept.
    """
    ids_by_key = {}
    for protein in model.proteins:
        # Compute the key before reading the id, as the comprehension did.
        key = key_fn(protein)
        ids_by_key[key] = protein.id
    return ids_by_key