Uniprot
提供了蛋白质的很多信息。
DrissionPage
一个比 Selenium 更强大的浏览器自动化(爬虫)库;Selenium 容易被网站的反爬机制识别,而且配置比较繁琐。
Code
- 输入:包含蛋白质 UniProt ID 的 TXT 文件,每行一个 ID
- 输出:CSV 文件(`output.csv`),包含 `id` 和 `DNA binding` 两列
from tkinter import W
import csv

from DrissionPage import Chromium, ChromiumOptions


def write_csv(data, path='output.csv'):
    """Write the collected DNA-binding positions to a CSV file.

    Args:
        data: Iterable of rows. ``row[0]`` is written to the ``id`` column and
            ``row[1]`` (an iterable of values) is joined with commas into the
            ``DNA binding`` column, e.g. ``['P12345', [1, 2, 3]]`` becomes
            ``P12345,"1,2,3"``.
        path: Destination file. Defaults to ``'output.csv'`` in the current
            working directory; an existing file is overwritten.
    """
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Header row.
        writer.writerow(['id', 'DNA binding'])
        # One data row per entry; the list becomes a string such as "1,2,3,4".
        for row in data:
            writer.writerow([row[0], ','.join(map(str, row[1]))])


# Create the options object (by default the configuration is read from the ini file)
co = ChromiumOptions()
# Use a specific portable Chrome build rather than whatever browser is configured by default.
co.set_browser_path(r'C:\Chrome_123.0.6312.59_x64.Green\App\chrome.exe')
browser = Chromium(addr_or_opts=co)
tab = browser.latest_tab


def _expand_positions(cell):
    """Expand a position cell such as ``'12'`` or ``'12-45'`` into a list of ints.

    A cell without ``-`` yields a one-element list; ``'start-end'`` yields every
    integer from ``start`` to ``end`` inclusive.

    Raises:
        ValueError: if a bound is not a valid integer.
    """
    bounds = cell.split('-')
    if len(bounds) == 1:
        return [int(bounds[0])]
    return list(range(int(bounds[0]), int(bounds[1]) + 1))


# Read the ID list up front so the input handle is closed before scraping starts.
with open('PDB2272_P.txt', 'r') as id_file:
    ids = id_file.readlines()

# NOTE(review): this creates/truncates PDB2272_P_binding.txt, but nothing in the
# visible code writes to or closes it -- confirm whether it is still needed.
open_file = open('PDB2272_P_binding.txt', 'w+')

# Collected rows, each shaped like [uniprot_id, [position, ...]],
# e.g. ['P12345', [1, 2, 3, 4]].
data = []

for line in ids:
    uniprot_id = line.strip()
    if not uniprot_id:
        # A blank line would otherwise request a malformed entry URL.
        continue

    positions = []
    tab.get(f'https://www.uniprot.org/uniprotkb/{uniprot_id}/entry')

    for row in tab.eles('@tag()=tr'):
        # Read the row text once instead of re-reading it for every check.
        row_text = row.text
        # Only table rows that mention "DNA binding" and whose text starts with
        # '+' are parsed (presumably the expandable feature rows -- verify
        # against the current UniProt page layout).
        if 'DNA binding' not in row_text or not row_text.startswith('+'):
            continue

        fields = row_text.split('\n')[0].split('\t')
        if len(fields) < 4:
            # No fourth tab-separated field to read the position from.
            print(uniprot_id, 'skipped row without a position field:', fields)
            continue

        position_cell = fields[3]
        print(uniprot_id, position_cell)
        try:
            positions.extend(_expand_positions(position_cell))
        except ValueError:
            # Skip a non-numeric position instead of aborting the whole run
            # before any results have been written.
            print(uniprot_id, 'skipped unparseable position:', position_cell)

    # Entries without any DNA-binding positions are left out of the output.
    if positions:
        data.append([uniprot_id, positions])

write_csv(data)