UniProt 데이터에 프로그래밍 방식으로 접근하는 방법에 대한 강의
Programmatic access to UniProt - course - Colaboratory (google.com)
새로운 웹사이트(2월 기준)가 나왔고, 이 웹사이트는 REST API를 사용한다.
import requests, sys, json
# Base URL of the EBI Proteins API.
# Documentation: https://www.ebi.ac.uk/proteins/api/doc/
# NOTE: not used in this session -- in the code visible here it only appears
# in a commented-out example.
PROTEINS_API = "https://www.ebi.ac.uk/proteins/api"
# Base URL of the UniProt website REST API, used by the UniProtKB and
# ID-mapping requests in this session.
# Documentation: https://rest.uniprot.org/beta/docs/
# NOTE(review): this points at a "beta" path -- confirm it is still served.
WEBSITE_API = "https://rest.uniprot.org/beta"
# Helper function to download data
def get_url(url, **kwargs):
    """Send a GET request and return the response, failing loudly on errors.

    Args:
        url: Address to fetch.
        **kwargs: Forwarded unchanged to ``requests.get``.

    Returns:
        The ``requests.Response`` object, returned only when ``response.ok``
        is true.

    Raises:
        requests.HTTPError: When the response is not OK. The response body is
            printed first, then ``raise_for_status()`` raises.
    """
    response = requests.get(url, **kwargs)
    if not response.ok:
        # Print the body before raising so the server's own error text is
        # visible alongside the exception.
        print(response.text)
        # Always raises for a non-OK response, so nothing after this line in
        # the branch can run (the former sys.exit() was unreachable).
        response.raise_for_status()
    return response
Complex search 진행
# Complex search through the "stream" endpoint.
# Streaming is simple because there is no pagination, but it comes at a cost:
#   - progress is harder to follow
#   - a failed download is harder to resume
#   - results are not sorted by score
stream_url = f"{WEBSITE_API}/uniprotkb/stream?query=(gene:MTM1) AND (taxonomy_id:9443)"
r = get_url(stream_url)
data = r.json()
# The number of hits is the length of the list stored under "results".
results = data["results"]
total = len(results)
print(data)
print(f"total: {total}")
검색 결과는 여러 가지 format(JSON, XML, list, FASTA, TSV)으로 받을 수 있음
# One search, several output formats. Only the TSV request is active; the
# other variants are kept as commented-out alternatives.
search_url = f"{WEBSITE_API}/uniprotkb/search?query=(gene:MTM1) AND (taxonomy_id:9443)"
## No format defined, defaults to JSON
# r = get_url(f"{search_url}&size=1")
## Specify a format explicitly in the URL
# r = get_url(f"{search_url}&size=1&format=xml")
## Specify a format using request headers
# r = get_url(f"{search_url}&size=1", headers={"Accept": "application/xml"})
## Other formats selectable through the URL
# r = get_url(f"{search_url}&format=list")
# r = get_url(f"{search_url}&format=fasta")
r = get_url(f"{search_url}&format=tsv")
print(r.text)
이런 식으로 column 정보를 조절하는 것도 가능함
# Same search as TSV, with the columns chosen through the "fields" parameter.
columns_url = (
    f"{WEBSITE_API}/uniprotkb/search"
    "?query=(gene:MTM1) AND (taxonomy_id:9443)"
    "&fields=id,accession,length,cc_catalytic_activity"
    "&format=tsv"
)
r = get_url(columns_url)
print(r.text)
단일 정보를 검색하는 것도 가능
# Fetch a single entry by accession and pretty-print its JSON.
# Whole entry:
entry_url = f"{WEBSITE_API}/uniprotkb/Q13496"
r = get_url(entry_url)
# Only the catalytic activity comments:
# r = get_url(f"{entry_url}?fields=cc_catalytic_activity")
entry = r.json()
print(json.dumps(entry, indent=2))
여러 검색 결과에 해당하는 항목들의 FASTA 가져오기
# get list of accessions
# (note that here the taxonomy restriction changed from primates to human)
r = get_url(f"{WEBSITE_API}/uniprotkb/search?query=(gene:MTM1) AND (taxonomy_id:9606)&format=list")
# Build the comma-separated list from whitespace-split tokens instead of
# replacing "\n" with ",": a trailing newline in the response would otherwise
# become a trailing comma (an empty last accession) in the requests below.
accessions = ",".join(r.text.split())
print(accessions)
# get the natural variants information from the accessions endpoint
# r = get_url(f"{WEBSITE_API}/uniprotkb/accessions?accessions={accessions}&fields=ft_variant,organism_name")
# print(json.dumps(r.json(), indent=2))
# note that there is another variation endpoint in the Proteins API with more data, see https://www.ebi.ac.uk/proteins/api/doc/#/variation
# r = get_url(f"{PROTEINS_API}/variation?accession={accessions}")
# print(json.dumps(r.json(), indent=2))
# get FASTA of these entries
r = get_url(f"{WEBSITE_API}/uniprotkb/accessions?accessions={accessions}&format=fasta")
fasta = r.text
print(fasta)
놀라운 것은 BLAST도 제공해 준다는 것
taxids에는 원하는 종의 taxonomy ID를 쉼표로 구분(comma-separated)하여 넣어 주면 됨
# get FASTA file
fasta_response = get_url(f"{WEBSITE_API}/uniprotkb/Q13496?format=fasta")
print(fasta_response.text)
# submit blast job
# The FASTA response is kept under its own name so that `r` is not read and
# rebound within the same statement.
r = requests.post("https://www.ebi.ac.uk/Tools/services/rest/ncbiblast/run", data={
    # NOTE(review): placeholder address -- presumably should be replaced with
    # a real contact e-mail before running; confirm against the service docs.
    "email": "example@example.com",
    "program": "blastp",
    "matrix": "BLOSUM62",
    "alignments": 250,
    "scores": 250,
    "exp": 10,
    "filter": "F",
    "gapalign": "true",
    "stype": "protein",
    "database": "uniprotkb_refprotswissprot",
    # comma-separated taxonomy IDs to restrict the search to
    "taxids": "40674",
    "sequence": fasta_response.text,
})
# Apply the same failure policy as get_url: show the server's message, then
# raise, instead of silently continuing after a rejected submission.
if not r.ok:
    print(r.text)
    r.raise_for_status()
# documentation here https://www.ebi.ac.uk/seqdb/confluence/pages/viewpage.action?pageId=94147939#NCBIBLAST+HelpandDocumentation-RESTAPI
# job_id = r.text
# print(job_id)
# # get job status
# r = get_url(f"https://www.ebi.ac.uk/Tools/services/rest/ncbiblast/status/{job_id}")
# print(r.text)
ID Mapping 도 할 수 있음
# Search "parkin" through all mammalia
r = get_url(f"{WEBSITE_API}/uniprotkb/stream?query=parkin AND (taxonomy_id:40674)&format=list")
# Join whitespace-split tokens: replacing "\n" with "," and then calling
# .strip() left a trailing comma in place, because strip() only removes
# whitespace and the final newline had already been turned into ",".
accessions = ",".join(r.text.split())
print("accessions:", accessions)
# Send job to ID mapping endpoint
r = requests.post(f"{WEBSITE_API}/idmapping/run", data={"from": "UniProtKB_AC-ID", "to": "ChEMBL", "ids": accessions})
# Same failure policy as get_url: without this check a rejected submission
# would only surface as a JSON decoding error or a KeyError on 'jobId'.
if not r.ok:
    print(r.text)
    r.raise_for_status()
job_id = r.json()['jobId']
print("job ID:", job_id)
# Query the status endpoint once for the submitted job and pretty-print the
# JSON it returns.
r = get_url(f"{WEBSITE_API}/idmapping/status/{job_id}")
print(json.dumps(r.json(), indent=2))
댓글