Print a line of HTML, keeping the right format - python

I must print all the raw text of this HTML page.
Each line has this format:
ENSG00000001461&nbsp;&nbsp;&nbsp;&nbsp;ENST00000432012&nbsp;&nbsp;&nbsp;&nbsp;NIPAL3&nbsp;&nbsp;&nbsp;&nbsp;5&nbsp;&nbsp;&nbsp;&nbsp;1&nbsp;&nbsp;&nbsp;&nbsp;Forward&nbsp;&nbsp;&nbsp;&nbsp;NIPA-like domain containing 3 [Source:HGNC Symbol;Acc:HGNC:25233]<br/>
I want the following output:
ENSG00000001461 ENST00000432012 NIPAL3 5 1 Forward NIPA-like domain containing 3 [Source:HGNC Symbol;Acc:HGNC:25233]
But the output is only:
ENSG00000001461
This is my code:
import urllib.request
from bs4 import BeautifulSoup

species = ['HomoSapiens', 'MusMusculus', 'DrosophilaMelanogaster', 'CaenorhabditisElegans']
rna_target = ['mRNA', 'lincRNA', 'lncRNA']
db = ['MB21E78v2', 'MB19E65v2', 'MB16E62v1']
species_input = input("Selezionare Specie: ")
target_input = input("Selezionare tipo di RNA: ")
db_input = input("Selezionare DataBase: ")
check = 0
for i in range(len(species)):
    if species_input == species[i]:
        for j in range(len(rna_target)):
            if target_input == rna_target[j]:
                for k in range(len(db)):
                    if db_input == db[k]:
                        check = 1
if check == 1:
    print("Dati Inseriti Correttamente!")
else:
    print("Error: Dati inseriti in modo errato!")
    exit()
url = urllib.request.urlopen("<https://cm.jefferson.edu/rna22/Precomputed/OptionController?>" + "species=" + species_input + "&type=" + target_input + "&version=" + db_input)
print(url.geturl())
identifier = []
seq_input = input("Digitare ID miRNA: ")
seq = ""
seq = seq_input.split()
print(seq)
for i in range(len(seq)):
    identifier.append(seq[i] + "%20")
s = ""
string = s.join(identifier)
url_tab = urllib.request.urlopen("<https://cm.jefferson.edu/rna22/Precomputed/InputController?>" + "identifier=" string + "&minBasePairs=12&maxFoldingEnergy=-12&minSumHits=1&maxProb=.1&" + "version=" + db_input + "&species=" + species_input + "&type=" + target_input)
print(url_tab.geturl())
download = urllib.request.urlopen("
<http://cm.jefferson.edu/rna22/Precomputed/InputController?>download=ALL" + "&ident=" + string + "&minBasePairs=12&maxFoldingEnergy=-12&minSumHits=1&maxProb=.1&" + "version=" + db_input + "&species=" + species_input + "&type=" + target_input)
down_string = download.geturl()
print(down_string)
soup = BeautifulSoup(download, "html5lib")
for match in soup.findAll('br'):
    match.unwrap()
s2 = soup
s1 = s2.body.extract()
print(s1.prettify(formatter=lambda s: s.strip(u'\xa0')))

There is no notion of lines in the source; there is just one long run of text, which you need to separate into lines using the br tags.
If you have to parse the source, you can replace the br tags with newlines and just pull the text:
import requests
from bs4 import BeautifulSoup

r = requests.get("https://cm.jefferson.edu/rna22/Precomputed/InputController?download=ALL&ident=hsa_miR_107%20hsa_miR_5011_5p%20hsa_miR_326&minBasePairs=12&maxFoldingEnergy=-12&minSumHits=1&maxProb=.1&version=MB21E78v2&species=HomoSapiens&type=mRNA")
soup = BeautifulSoup(r.content)
for b in soup.find_all("br"):
    b.replace_with("\n")
print(soup.text)
Which will give you:
ENSG00000001461    ENST00000432012    NIPAL3    5    1    Forward    NIPA-like domain containing 3 [Source:HGNC Symbol;Acc:HGNC:25233]
ENSG00000001631    ENST00000340022    KRIT1    5    7    Reverse    KRIT1, ankyrin repeat containing [Source:HGNC Symbol;Acc:HGNC:1573]
ENSG00000001631    ENST00000394503    KRIT1    3    7    Reverse    KRIT1, ankyrin repeat containing [Source:HGNC Symbol;Acc:HGNC:1573]
ENSG00000001631    ENST00000394505    KRIT1    3    7    Reverse    KRIT1, ankyrin repeat containing [Source:HGNC Symbol;Acc:HGNC:1573]
ENSG00000001631    ENST00000394507    KRIT1    4    7    Reverse    KRIT1, ankyrin repeat containing [Source:HGNC Symbol;Acc:HGNC:1573]
ENSG00000001631    ENST00000412043    KRIT1    4    7    Reverse    KRIT1, ankyrin repeat containing [Source:HGNC Symbol;Acc:HGNC:1573]
ENSG00000002834    ENST00000318008    LASP1    6    17    Forward    LIM and SH3 protein 1 [Source:HGNC Symbol;Acc:HGNC:6513]
ENSG00000002834    ENST00000433206    LASP1    6    17    Forward    LIM and SH3 protein 1 [Source:HGNC Symbol;Acc:HGNC:6513]
ENSG00000002834    ENST00000435347    LASP1    5    17    Forward    LIM and SH3 protein 1 [Source:HGNC Symbol;Acc:HGNC:6513]
ENSG00000005381    ENST00000225275    MPO    5    17    Reverse    myeloperoxidase [Source:HGNC Symbol;Acc:HGNC:7218]
ENSG00000005889    ENST00000539115    ZFX    4    23 X    Forward    zinc finger protein, X-linked [Source:HGNC Symbol;Acc:HGNC:12869]
ENSG00000006432    ENST00000554752    MAP3K9    10    14    Reverse    mitogen-activated protein kinase kinase kinase 9 [Source:HGNC Symbol;Acc:HGNC:6861]
ENSG00000006432    ENST00000611979    MAP3K9    10    14    Reverse    mitogen-activated protein kinase kinase kinase 9 [Source:HGNC Symbol;Acc:HGNC:6861]
ENSG00000007216    ENST00000314669    SLC13A2    4    17    Forward    solute carrier family 13 (sodium-dependent dicarboxylate transporter), member 2 [Source:HGNC Symbol;Acc:HGNC:10917]
ENSG00000007216    ENST00000444914    SLC13A2    4    17    Forward    solute carrier family 13 (sodium-dependent dicarboxylate transporter), member 2 [Source:HGNC Symbol;Acc:HGNC:10917]
And a whole lot more of the same.

I tested your code and replaced my previous answer.
If you fix the following errors, your code seems to work:
Remove the < and > from the urls
Remove the EOL in line 42
Add a + between "identifier=" and string
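For reference, a minimal sketch of the two request calls with those fixes applied (the rest of the script unchanged; string, db_input, species_input, and target_input are the variables from the question):
import urllib.request

# angle brackets removed, "+" added before string, download URL kept on one line
url_tab = urllib.request.urlopen(
    "https://cm.jefferson.edu/rna22/Precomputed/InputController?"
    + "identifier=" + string
    + "&minBasePairs=12&maxFoldingEnergy=-12&minSumHits=1&maxProb=.1"
    + "&version=" + db_input + "&species=" + species_input + "&type=" + target_input)
download = urllib.request.urlopen(
    "http://cm.jefferson.edu/rna22/Precomputed/InputController?download=ALL"
    + "&ident=" + string
    + "&minBasePairs=12&maxFoldingEnergy=-12&minSumHits=1&maxProb=.1"
    + "&version=" + db_input + "&species=" + species_input + "&type=" + target_input)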
Here are some of the lines of the output I get:
ENSG00000272325    ENST00000607016    NUDT3    4    6    Reverse    nudix (nucleoside diphosphate linked moiety X)-type motif 3 [Source:HGNC Symbol;Acc:HGNC:8050]
ENSG00000272980    ENST00000400926    CCR6    5    6    Forward    chemokine (C-C motif) receptor 6 [Source:HGNC Symbol;Acc:HGNC:1607]
ENSG00000274211    ENST00000612932    SOCS7    8    17    Forward    suppressor of cytokine signaling 7 [Source:HGNC Symbol;Acc:HGNC:29846]
ENSG00000274588    ENST00000611977    DGKK    4    23 X    Reverse    diacylglycerol kinase, kappa [Source:HGNC Symbol;Acc:HGNC:32395]
ENSG00000275004    ENST00000613655    ZNF280B    4    22    Reverse    zinc finger protein 280B [Source:HGNC Symbol;Acc:HGNC:23022]
ENSG00000275004    ENST00000619852    ZNF280B    4    22    Reverse    zinc finger protein 280B [Source:HGNC Symbol;Acc:HGNC:23022]
ENSG00000275832    ENST00000622683    ARHGAP23    6    17    Forward    Rho GTPase activating protein 23 [Source:HGNC Symbol;Acc:HGNC:29293]
ENSG00000277258    ENST00000616199    PCGF2    3    17    Reverse    polycomb group ring finger 2 [Source:HGNC Symbol;Acc:HGNC:12929]
ENSG00000278871    ENST00000623344    KDM5D    8    24 Y    Reverse    lysine (K)-specific demethylase 5D [Source:HGNC Symbol;Acc:HGNC:11115]
ENSG00000279096    ENST00000622918    AL356289.1    11    1    Forward    HCG1780467 {ECO:0000313|EMBL:EAX06861.1}; PRO0529 {ECO:0000313|EMBL:AAF16687.1} [Source:UniProtKB/TrEMBL;Acc:Q9UI23]


Storing keyvalue as header and value text as rows using data frame in python using beautiful soup

for imo in imos:
    ...
    ...
    keys_div = soup.find_all("div", {"class": "col-4 keytext"})
    values_div = soup.find_all("div", {"class": "col-8 valuetext"})
    for key, value in zip(keys_div, values_div):
        print(key.text + ": " + value.text)
    ...
Output:
Ship Name: MAERSK ADRIATIC
Shiptype: Chemical/Products Tanker
IMO/LR No.: 9636632
Gross: 23,297
Call Sign: 9V3388
Deadweight: 37,538
MMSI No.: 566429000
Year of Build: 2012
Flag: Singapore
Status: In Service/Commission
Operator: Handytankers K/S
Shipbuilder: Hyundai Mipo Dockyard Co Ltd
ShipType: Chemical/Products Tanker
Built: 2012
GT: 23,297
Deadweight: 37,538
Length Overall: 184.000
Length (BP): 176.000
Length (Reg): 177.460
Bulbous Bow: Yes
Breadth Extreme: 27.430
Breadth Moulded: 27.400
Draught: 11.500
Depth: 17.200
Keel To Mast Height: 46.900
Displacement: 46565
T/CM: 45.0
This is the output for one IMO. I want to store this output in a dataframe and write it to CSV, with the key text as the header and the value text as rows for all the IMOs. Please help me with how to do it.
All you have to do is add the results to a list and then output that list to a dataframe.
import pandas as pd

filepath = r"C:\users\test\test_file.csv"
output_data = []
for imo in imos:
    keys_div = [i.text for i in soup.find_all("div", {"class": "col-4 keytext"})]
    values_div = [i.text for i in soup.find_all("div", {"class": "col-8 valuetext"})]
    dict1 = dict(zip(keys_div, values_div))
    output_data.append(dict1)
df = pd.DataFrame(output_data)
df.to_csv(filepath, index=False)
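This works because pandas builds the columns from the union of the dict keys, so every IMO ends up as one row under a shared header, and keys missing for a given IMO become NaN. A tiny illustration (the second record is hypothetical):
import pandas as pd

rows = [
    {"Ship Name": "MAERSK ADRIATIC", "IMO/LR No.": "9636632"},
    {"Ship Name": "EXAMPLE SHIP", "Flag": "Singapore"},  # hypothetical record
]
df = pd.DataFrame(rows)  # columns = union of keys; gaps become NaN
print(df)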

How to transform an affirmative sentence into a general question using Python Udapi?

I would like to transform some pretty simple affirmative sentences into general questions (the language of choice is Spanish). Consider the following example:
Esto es muy difícil. -> Es esto muy difícil?
So I just need to swap the positions of the subject and the predicate (wherever they are).
Normally it can be done with the shift_before_node() method:
pron_node, aux_node = tree.descendants[0], tree.descendants[1]
aux_node.shift_before_node(pron_node)
However, if I want to automate the process (because the subject and predicate will not always be in the same positions), I need to create a cycle (see The Problem paragraph below) over each node of a tree: if a node's part of speech (upos) is PRON or PROPN, and it is followed (not necessarily directly) by a node which is a VERB or AUX, the second node needs to be shifted before the first one (like in the example above). But I don't know how to implement this as a cycle. Any suggestions?
Here is my code so far (done in Google Colab). I apologize for excluding some of the console text; otherwise it would be too lengthy.
Request to UDPipe server
import requests
response = requests.get("http://lindat.mff.cuni.cz/services/udpipe/api/models")
info = response.json()
info
for key, data in info["models"].items():
    if "spanish" in key:
        print(key, data)
params = {"tokenizer": "", "tagger": "", "parser": "", "model": "spanish-gsd-ud-2.6-200830"}
text = "Esto es muy difícil."
params["data"] = text
response = requests.get("http://lindat.mff.cuni.cz/services/udpipe/api/process", params)
json_response = response.json()
parse = json_response["result"]
print(parse)
Output #1 (print(parse)):
# generator = UDPipe 2, https://lindat.mff.cuni.cz/services/udpipe
# udpipe_model = spanish-gsd-ud-2.6-200830
# udpipe_model_licence = CC BY-NC-SA
# newdoc
# newpar
# sent_id = 1
# text = Esto es muy difícil.
1 Esto este PRON _ Number=Sing|PronType=Dem 4 nsubj _ _
2 es ser AUX _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 4 cop _ _
3 muy mucho ADV _ _ 4 advmod _ _
4 difícil difícil ADJ _ Number=Sing 0 root _ SpaceAfter=No
5 . . PUNCT _ _ 4 punct _ SpaceAfter=No
Udapi Installation:
!pip install --upgrade git+https://github.com/udapi/udapi-python.git
import os
os.environ['PATH'] += ":" + os.path.join(os.environ['HOME'], ".local/bin")
from udapi.block.read.conllu import Conllu
from udapi.core.document import Document
from udapi.block.write.textmodetrees import TextModeTrees
from io import StringIO
Building a tree:
In my understanding a tree is a variable of a built-in Udapi class, which is a structured version of the parse variable and which contains all the information about each word of a sentence - its order (ord), given form (form), initial form (lemma), part of speech (upos) and so on:
tree = Conllu(filehandle=StringIO(parse)).read_tree()
writer = TextModeTrees(attributes="ord,form,lemma,upos,feats,deprel", layout="align")
writer.process_tree(tree)
Output #2 (writer.process_tree(tree)):
# sent_id = 1
# text = Esto es muy difícil.
─┮
│ ╭─╼ 1 Esto este PRON Number=Sing|PronType=Dem nsubj
│ ┢─╼ 2 es ser AUX Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin cop
│ ┢─╼ 3 muy mucho ADV _ advmod
╰─┾ 4 difícil difícil ADJ Number=Sing root
╰─╼ 5 . . PUNCT _ punct
It is also possible to print out all the dependents for each node of a given tree. As already correctly noted in the comments, tree.descendants consists of a list of nodes:
for node in tree.descendants:
    print(f"{node.ord}:{node.form}")
    left_children = node.children(preceding_only=True)
    if len(left_children) > 0:
        print("Left dependents:", end=" ")
        for child in left_children:
            print(f"{child.ord}:{child.form}", end=" ")
        print("")
    right_children = node.children(following_only=True)
    if len(right_children) > 0:
        print("Right dependents:", end=" ")
        for child in right_children:
            print(f"{child.ord}:{child.form}", end=" ")
        print("")
Output #3:
1:Esto
2:es
3:muy
4:difícil
Left dependents: 1:Esto 2:es 3:muy
Right dependents: 5:.
5:.
The problem (beginning of a cycle):
for node in tree.descendants:
    if node.upos == "VERB" or node.upos == "AUX":
UPDATE 1
So, I've come to the first somewhat complete version of the needed cycle, and now it looks like this:
for i, curr_node in enumerate(nodes[1:], 1):
    prev_node = nodes[i-1]
    if (prev_node.upos == "PRON" or prev_node.upos == "PROPN") and (curr_node.upos == "VERB" or curr_node.upos == "AUX"):
        curr_node.shift_before_node(prev_node)
But now I get this error:
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-a967bbd730fe> in <module>()
      9
     10
---> 11 for i, curr_node in enumerate(nodes[1:], 1):
     12     prev_node = nodes[i-1]
     13     if (prev_node.upos == "PRON" or prev_node.upos == "PROPN") and (curr_node.upos == "VERB" or curr_node.upos == "AUX"):

NameError: name 'nodes' is not defined
UPDATE 2
I tried defining nodes like this:
nodes = tree.descendants
Now my cycle at least runs, but it still didn't change the structure of the given sentence:
nodes = tree.descendants
for i, curr_node in enumerate(nodes[1:], 1):
    prev_node = nodes[i-1]
    if (prev_node.upos == "PRON" or prev_node.upos == "PROPN") and (curr_node.upos == "VERB" or curr_node.upos == "AUX"):
        curr_node.shift_before_node(prev_node)
Checking the tree:
tree = Conllu(filehandle=StringIO(parse)).read_tree()
writer = TextModeTrees(attributes="ord,form,lemma,upos,feats,deprel", layout="align")
writer.process_tree(tree)
# sent_id = 1
# text = Esto es muy difícil.
─┮
│ ╭─╼ 1 Esto este PRON Number=Sing|PronType=Dem nsubj
│ ┢─╼ 2 es ser AUX Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin cop
│ ┢─╼ 3 muy mucho ADV _ advmod
╰─┾ 4 difícil difícil ADJ Number=Sing root
╰─╼ 5 . . PUNCT _ punct
Nothing changed.
UPDATE 3
I've also tried to check whether the cycle swaps the subject and predicate back again (a second time), making the sentence look like the original one, but I guess that's not the case, because even with the break part commented out, flag increased by 1 only:
nodes = tree.descendants
flag = 1
for i, curr_node in enumerate(nodes[1:], 1):
    prev_node = nodes[i-1]
    if ((prev_node.upos == "PRON") or (prev_node.upos == "PROPN")) and ((curr_node.upos == "VERB") or (curr_node.upos == "AUX")):
        curr_node.shift_before_node(prev_node)
        flag = flag + 1
        # if flag == 2:
        #     break
print(flag)
Output
2
HOWEVER, this means that the condition if ((prev_node.upos == "PRON") or (prev_node.upos == "PROPN")) and ((curr_node.upos == "VERB") or (curr_node.upos == "AUX")) was satisfied.
Suppose there is one sentence per line in affirm.txt with affirmative Spanish sentences such as "Esto es muy difícil." or "Tus padres compraron esa casa de la que me hablaste.".
As an alternative to using the UDPipe web service, we can parse the sentences locally (I slightly prefer the es_ancora model over es_gsd):
import udapi
doc = udapi.Document('affirm.txt')
udapi.create_block('udpipe.Base', model_alias='es_ancora').apply_on_document(doc)
To make repeated experiments faster, we can now store the parsed trees to a CoNLL-U file using doc.store_conllu('affirm.conllu') and later load it using doc = udapi.Document('affirm.conllu').
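For example:
doc.store_conllu('affirm.conllu')      # cache the parsed trees to disk
doc = udapi.Document('affirm.conllu')  # reload them later without re-parsing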
To draw the trees we can use the doc.draw() method (or even tree.draw()), which is a syntactic sugar that uses TextModeTrees() behind the scenes. So to compare the sentences before and after changing the word order, we can use:
print("Original word order:")
doc.draw() # or doc.draw(attributes="ord,form,lemma,deprel,feats,misc")
for tree in doc.trees:
    process_tree(tree)
print("Question-like word order:")
doc.draw()
Now comes the main work - to implement the process_tree() subroutine. Note that
We need to change the word order of the main clause only (e.g. "Tus padres compraron esa casa."), not of any dependent clauses (e.g. "de la que me hablaste"). So we don't want to iterate over all nodes (tree.descendants); we just need to find the main predicate (usually a verb) and its subject.
The subject does not need to be only PRON or PROPN; it can be NOUN, or maybe just ADJ if the governing noun is omitted. So it is safer to just ask for deprel=nsubj (handling csubj is beyond the scope of this question).
I don't speak Spanish, but I think the rule is not as simple as moving the verb before the subject (or moving the subject after the verb). At least, we need to distinguish transitive verbs (with objects) and copula constructions. Of course, even the solution below is not perfect; it is rather an example of how to use Udapi.
We should handle the nasty details like capitalization and spacing.
def process_tree(tree):
    # Find the main predicate and its subject
    main_predicate = tree.children[0]
    nsubj = next((n for n in main_predicate.children if n.udeprel == 'nsubj'), None)
    if not nsubj:
        return
    # Move the subject
    # - after the auxiliary copula verb if present
    # - or after the last object if present
    # - or after the main predicate (verb)
    cop = next((n for n in main_predicate.children if n.udeprel == 'cop'), None)
    if cop:
        nsubj.shift_after_subtree(cop)
    else:
        objects = [n for n in main_predicate.children if n.udeprel in ('obj', 'iobj')]
        if objects:
            nsubj.shift_after_subtree(objects[-1])
        else:
            nsubj.shift_after_node(main_predicate)
    # Fix the capitalization
    nsubj_start = nsubj.descendants(add_self=True)[0]
    if nsubj_start.lemma[0].islower() and nsubj_start.form[0].isupper():
        nsubj_start.form = nsubj_start.form.lower()
    tree.descendants[0].form = tree.descendants[0].form.capitalize()
    # Add a question mark (instead of a full stop)
    dots = [n for n in main_predicate.children if n.form == '.']
    if not dots:
        dots = [main_predicate.create_child(upos="PUNCT", deprel="punct")]
    dots[-1].form = '?'
    # Fix spacing
    dots[-1].prev_node.misc["SpaceAfter"] = "No"
    nsubj_start.prev_node.misc["SpaceAfter"] = ""
    # Recompute the string representation of the sentence
    tree.text = tree.compute_text()
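A hedged usage sketch, applying process_tree() to the tree parsed from the question's example:
tree = Conllu(filehandle=StringIO(parse)).read_tree()
process_tree(tree)
print(tree.text)  # expected, per the question: "Es esto muy difícil?"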
The solution above uses Udapi as a library. An alternative would be to move the main code into a Udapi block called e.g. MakeQuestions:
from udapi.core.block import Block

class MakeQuestions(Block):
    def process_tree(self, tree):
        # the rest is the same as in the solution above
If we store this block in the current directory in file makequestions.py, we can call it from the command line in many ways:
# parse the affirmative sentences
cat affirm.txt | udapy -s \
read.Sentences \
udpipe.Base model_alias=es_ancora \
> affirm.conllu
# visualize the output with TextModeTrees (-T)
cat affirm.conllu | udapy -T .MakeQuestions | less -R
# store the output in CoNLL-U
udapy -s .MakeQuestions < affirm.conllu > questions.conllu
# show just the plain-text sentences
udapy write.Sentences < questions.conllu > questions.txt
# visualize the differences in HTML
udapy -H \
read.Conllu files=affirm.conllu zone=affirm \
read.Conllu files=questions.conllu zone=questions \
util.MarkDiff gold_zone=affirm attributes=form ignore_parent=1 \
> differences.html

Make a list / table with 2 FOR in Python

I made a program and its output looks like this:
A Alanina
B Ácido aspártico ou Asparagina
C Cisteína
D Ácido aspártico
E Ácido glutâmico
F Fenilalanina
G Glicina
H Histidina
I Isoleucina
J Leucina (L) ou Isoleucina
K Lisina
L Leucina
M Metionina
N Asparagina
O Pirrolisina
P Prolina
Q Glutamina
R Arginina
S Serina
T Treonina
U Selenocisteína
V Valina
W Triptofano
X qualquer
Y Tirosina
33
0
4
26
32
14
38
14
26
0
25
36
15
16
0
19
15
16
14
20
0
32
0
11
But I want these numbers to be next to the letter and name columns, in a list that would look like this:
-A ------ Alanina -------- number of times the A appears
-B ------ Aspartic acid or asparagine -------- number of times B appears
It gets its information from an e.coli.fasta.txt file:
>sp|A1AA21|PEPT_ECOK1 Peptidase T OS=Escherichia coli O1:K1 / APEC OX=405955 GN=pepT PE=3 SV=1
MDKLLERFLNYVSLDTQSKAGVRQVPSTEGQWKLLHLLKEQLEEMGLINVTLSEKGTLMA
TLPANVPGDIPAIGFISHVDTSPDCSGKNVNPQIVENYRGGDIALGIGDEVLSPVMFPVL
HQLLGQTLITTDGKTLLGADDKAGIAEIMTALAVLQQKNIPHGDIRVAFTPDEEVGKGAK
HFDVDAFDARWAYTVDGGGVGELEFENFNAASVNIKIVGNNVHPGTAKGVMVNALSLAAR
IHAEVPADESPEMTEGYEGFYHLASMKGTVERADMHYIIRDFDRKQFEARKRKMMEIAKK
VGKGLHPDCYIELVIEDSYYNMREKVVEHPHILDIAQQAMRDCDIEPELKPIRGGTDGAQ
LSFMGLPCPNLFTGGYNYHGKHEFVTLEGMEKAVQVIVRIAELTAQRK
and this is the program code:
f = open('e.coli.fasta.txt', 'r')
sequencia = f.readlines()
amino = []  # to put the file into a list containing only the text of interest
for linha in sequencia:
    if linha.find('>') != 0:
        amino.append(linha)
tfasta = "".join(amino)
aminoacidos = {}
aminoacidos = {'A':'Alanina','B':'Ácido aspártico ou Asparagina','C':'Cisteína','D':'Ácido aspártico','E':'Ácido glutâmico','F':'Fenilalanina','G':'Glicina','H':'Histidina','I':'Isoleucina','J':'Leucina (L) ou Isoleucina','K':'Lisina','L':'Leucina','M':'Metionina','N':'Asparagina','O':'Pirrolisina','P':'Prolina','Q':'Glutamina','R':'Arginina','S':'Serina','T':'Treonina','U':'Selenocisteína','V':'Valina','W':'Triptofano','X':'qualquer','Y':'Tirosina'}
def ocorrencias(string):
    result = {}
    chaves = 'ABCDEFGHIJKLMNOPQRSTUVXY'
    for i in chaves:
        result[i] = tfasta.count(i)
    return result
ocor = ocorrencias(tfasta)
with open('PeptidadeT-aminoacidos', 'w') as p:
    for i in range(65, 90):
        a = ('%s' % (chr(i)))
        p.write('{:4s}\t{:5s}\n'.format(a, aminoacidos[a]))
    for e in ocor.values():
        p.write('{}\n'.format(e))
The variable ocor is a Python data type called a dictionary [1]. In your code it is composed of {key: value} = {"aminoacido": ocorrencias}. You can use the amino acid letter to get its number of occurrences, like this: ocor['A'] will return 33.
f = open('e.coli.fasta.txt', 'r')
sequencia = f.readlines()
amino = []  # to put the file into a list containing only the text of interest
for linha in sequencia:
    if linha.find('>') != 0:
        amino.append(linha)
tfasta = "".join(amino)
aminoacidos = {}
aminoacidos = {'A':'Alanina','B':'Ácido aspártico ou Asparagina','C':'Cisteína','D':'Ácido aspártico','E':'Ácido glutâmico','F':'Fenilalanina','G':'Glicina','H':'Histidina','I':'Isoleucina','J':'Leucina (L) ou Isoleucina','K':'Lisina','L':'Leucina','M':'Metionina','N':'Asparagina','O':'Pirrolisina','P':'Prolina','Q':'Glutamina','R':'Arginina','S':'Serina','T':'Treonina','U':'Selenocisteína','V':'Valina','W':'Triptofano','X':'qualquer','Y':'Tirosina'}
def ocorrencias(string):
    result = {}
    chaves = 'ABCDEFGHIJKLMNOPQRSTUVWXY'
    for i in chaves:
        result[i] = tfasta.count(i)
    return result
ocor = ocorrencias(tfasta)
with open('PeptidadeT-aminoacidos', 'w') as p:
    for i in range(65, 90):
        a = ('%s' % (chr(i)))
        p.write('-{:4s}------{:5s}------{}\n'.format(a, aminoacidos[a], ocor[a]))
Just a side note here: the letter W was missing from chaves; I added it to prevent a KeyError. If this is not wanted, you can add a try/except clause around the p.write call instead.
chaves = 'ABCDEFGHIJKLMNOPQRSTUVWXY'
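A minimal sketch of that try/except variant, assuming chaves is left without W:
with open('PeptidadeT-aminoacidos', 'w') as p:
    for i in range(65, 90):
        a = chr(i)
        try:
            p.write('-{:4s}------{:5s}------{}\n'.format(a, aminoacidos[a], ocor[a]))
        except KeyError:
            pass  # letter missing from ocor (e.g. W): skip it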
[1] https://realpython.com/python-dicts/

Need help writing a regex

I need a regex that reads a file with BLAST information.
The file looks like:
****ALIGNMENT****
Sequence: gi|516137619|ref|WP_017568199.1| hypothetical protein [Nocardiopsis synnemataformans]
length: 136
E_value: 8.9548e-11
score: 153.0
bit_score: 63.5438
identities: 35
positives: 42
gaps: 6
align_length: 70
query: MIRIHPASRDPQTLLDPENWRSAAWNGAPIRDCRGCIDCCDDDWNRSEPEWRRCYGEHLAEDVRHGVAVC...
match: MIRI A+RD LLDP NW S W+ A R CRGC DC + +CYGE + +DVRHGV+VC...
sbjct: MIRIDRANRDHAELLDPANWLSFHWSNAT-RACRGCDDC-----GGTTETLVQCYGEGVVDDVRHGVSVC...
I already have code, but this file contains some extra data. The variable names, with their corresponding values in this example, are:
hitsid = 516137619
protein = hypothetical protein
organism = Nocardiopsis synnemataformans
length = 136
evalue = 8.9548e-11
score = 153.0
bitscore = 63.5438
identities = 35
positives = 42
gaps = 6
query = MIRIHPASRDPQTLLDPENWRSAAWNGAPIRDCRGCIDCCDDDWNRSEPEWRRCYGEHLAEDVRHGVAVC...
match = MIRI A+RD LLDP NW S W+ A R CRGC DC + +CYGE + +DVRHGV+VC...
subject = MIRIDRANRDHAELLDPANWLSFHWSNAT-RACRGCDDC-----GGTTETLVQCYGEGVVDDVRHGVSVC...
I'm looking for something like this; this is a regex I already have, but now there are some extra fields added:
p = re.compile(r'^Sequence:[^|]*\|(?P<hitsid>[^|]*)\|\S*\s*(?P<protein>[^][]*?)\s*\[(?P<organism>[^][]*)][\s\S]*?\nE-value:\s*(?P<evalue>.*)', re.MULTILINE)
The file looks like:
****ALIGNMENT****
Sequence: gi|516137619|ref|WP_017568199.1| hypothetical protein [Nocardiopsis synnemataformans]
length: 136
E_value: 8.9548e-11
score: 153.0
bit_score: 63.5438
identities: 35
positives: 42
gaps: 6
align_length: 70
query: MIRIHPASRDPQTLLDPENWRSAAWNGAPIRDCRGCIDCCDDDWNRSEPEWRRCYGEHLAEDVRHGVAVC...
match: MIRI A+RD LLDP NW S W+ A R CRGC DC + +CYGE + +DVRHGV+VC...
sbjct: MIRIDRANRDHAELLDPANWLSFHWSNAT-RACRGCDDC-----GGTTETLVQCYGEGVVDDVRHGVSVC...
****ALIGNMENT****
Sequence: gi|962700925|ref|BC_420072443.1| Protein crossbronx-like [Nocardiopsis synnemataformans]
length: 136
E_value: 8.9548e-11
score: 153.0
bit_score: 63.5438
identities: 35
positives: 42
gaps: 6
align_length: 70
query: MIRIHPASRDPQTLLDPENWRSAAWNGAPIRDCRGCIDCCDDDWNRSEPEWRRCYGEHLAEDVRHGVAVC...
match: MIRI A+RD LLDP NW S W+ A R CRGC DC + +CYGE + +DVRHGV+VC...
sbjct: MIRIDRANRDHAELLDPANWLSFHWSNAT-RACRGCDDC-----GGTTETLVQCYGEGVVDDVRHGVSVC...
****ALIGNMENT****
Sequence: gi|516137619|ref|WP_017568199.1| hypothetical protein [Nocardiopsis synnemataformans]
length: 136
E_value: 8.9548e-11
score: 153.0
bit_score: 63.5438
identities: 35
positives: 42
gaps: 6
align_length: 70
query: MIRIHPASRDPQTLLDPENWRSAAWNGAPIRDCRGCIDCCDDDWNRSEPEWRRCYGEHLAEDVRHGVAVC...
match: MIRI A+RD LLDP NW S W+ A R CRGC DC + +CYGE + +DVRHGV+VC...
sbjct: MIRIDRANRDHAELLDPANWLSFHWSNAT-RACRGCDDC-----GGTTETLVQCYGEGVVDDVRHGVSVC...
You don't need a regex:
parsed = []
raw_parts = open('tmp9.txt', 'r').read().split('****ALIGNMENT****')
for raw_part in raw_parts:
    parsed_dict = {}
    for line in raw_part.split('\n'):
        try:
            key, value = line.split(':')
            parsed_dict[key] = value.strip()
        except:
            pass
    parsed.append(parsed_dict)
print(parsed)
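The Sequence value still bundles the id, protein, and organism together; a hedged sketch that splits them afterwards, reusing the named groups from the question's regex:
import re

seq_re = re.compile(r'gi\|(?P<hitsid>[^|]*)\|\S*\s*(?P<protein>[^][]*?)\s*\[(?P<organism>[^][]*)\]')
for record in parsed:
    m = seq_re.search(record.get('Sequence', ''))
    if m:
        record.update(m.groupdict())  # adds hitsid, protein and organism keys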

merge two txt files together by one common column python

How can I read in two tab-delimited .txt files and map them together by one common column?
For example, from these two files create a mapping of gene to pathway:
First file, pathway.txt
Pathway Protein
Binding and Uptake of Ligands by Scavenger Receptors P69905
Erythrocytes take up carbon dioxide and release oxygen P69905
Metabolism P69905
Amyloids P02647
Metabolism P02647
Hemostasis P68871
Second file, gene.txt
Gene Protein
Fabp3 P11404
HBA1 P69905
APOA1 P02647
Hbb-b1 P02088
HBB P68871
Hba P01942
output would be like,
Gene Protein Pathway
Fabp3 P11404
HBA1 P69905 Binding and Uptake of Ligands by Scavenger Receptors, Erythrocytes take up carbon dioxide and release oxygen, Metabolism
APOA1 P02647 Amyloids, Metabolism
Hbb-b1 P02088
HBB P68871 Hemostasis
Hba P01942
Leave the pathway blank if no pathway corresponds to the gene, based on the protein ID information.
UPDATE:
import pandas as pd

file1 = pd.read_csv("gene.csv")
file2 = pd.read_csv("pathway.csv")
output = pd.concat([file1, file2]).fillna(" ")
output = output[["Gene", "Protein"] + list(output.columns[1:-1])]
output.to_csv("mapping of gene to pathway.csv", index=False)

This only gives me the concatenated file, which is not what I expected.
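A hedged pandas sketch of the intended join, assuming the .txt files are tab-separated as shown:
import pandas as pd

gene = pd.read_csv("gene.txt", sep="\t")
pathway = pd.read_csv("pathway.txt", sep="\t")

# one comma-separated pathway string per protein
per_protein = pathway.groupby("Protein")["Pathway"].apply(", ".join).reset_index()

# a left join keeps every gene; proteins without pathways stay blank
output = gene.merge(per_protein, on="Protein", how="left").fillna("")
output.to_csv("mapping of gene to pathway.csv", sep="\t", index=False)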
>>> from collections import defaultdict
>>> my_dict = defaultdict()
>>> f = open('pathway.txt')
>>> for x in f:
...     x = x.strip().split()
...     value, key = " ".join(x[:-1]), x[-1]
...     if my_dict.get(key, 0) == 0:
...         my_dict[key] = [value]
...     else:
...         my_dict[key].append(value)
...
>>> my_dict
defaultdict(None, {'P68871': ['Hemostasis'], 'Protein': ['Pathway'], 'P69905': ['Binding and Uptake of Ligands by Scavenger Receptors', 'Erythrocytes take up carbon dioxide and release oxygen', 'Metabolism'], 'P02647': ['Amyloids', 'Metabolism']})
>>> f1 = open('gene.txt')
>>> for x in f1:
...     value, key = x.strip().split()
...     if my_dict.get(key, 0) == 0:
...         print("{:<15}{:<15}".format(value, key))
...     else:
...         print("{:<15}{:<15}{}".format(value, key, ", ".join(my_dict[key])))
...
Gene           Protein        Pathway
Fabp3          P11404
HBA1           P69905         Binding and Uptake of Ligands by Scavenger Receptors, Erythrocytes take up carbon dioxide and release oxygen, Metabolism
APOA1          P02647         Amyloids, Metabolism
Hbb-b1         P02088
HBB            P68871         Hemostasis
Hba            P01942
class Protein:
    def __init__(self, protein, pathway=None, gene=""):
        self.protein = protein
        self.pathways = []
        self.gene = gene
        if pathway is not None:
            self.pathways.append(pathway)

    def __str__(self):
        return "%s\t%s\t%s" % (
            self.gene,
            self.protein,
            ", ".join(self.pathways))

# protein -> pathway map
proteins = {}

# get the pathways
f1 = open("pathway.txt")
for line in f1.readlines()[1:]:
    tokens = line.split()
    pathway = " ".join(tokens[:-1])
    protein = tokens[-1]
    if protein in proteins:
        p = proteins[protein]
        p.pathways.append(pathway)
    else:
        p = Protein(protein=protein, pathway=pathway)
        proteins[protein] = p

# get the genes
f2 = open("gene.txt")
for line in f2.readlines()[1:]:
    gene, protein = line.split()
    if protein in proteins:
        p = proteins[protein]
        p.gene = gene
    else:
        p = Protein(protein=protein, gene=gene)
        proteins[protein] = p

# print the results
print("Gene\tProtein\tPathway")
for protein in proteins.values():
    print(protein)
