I extracted some specific names from a text. The text and the function are shown below:
import re
text = '''
def cal_revenue(revenues_store, profit_margin, average_profit):
    average_profit = revenues_store * profit_margin
    return average_profit
'''
# Extract names
lines = text.split('\n')
for line in lines:
    x = re.search(r"^def.*:$", line)
    if x != None:
        values = x[0].split('def ')[1].split('(')
        function_name = values[0]
        arguments = values[1][:-2].split(', ')
        print(f"Function Name: {function_name}")
        print(f"Arguments: {arguments}")
This function works well and gives the expected results. Now I want to store all of these results in separate dictionaries:
# Create dictionaries
splited_table1 = dict()
splited_table2 = dict()

# Extract names
def extraction_variables(text):
    lines = text.split('\n')
    for line in lines:
        x = re.search(r"^def.*:$", line)
        if x != None:
            values = x[0].split('def ')[1].split('(')
            splited_table1 = values[0]
            splited_table2 = values[1][:-2].split(', ')
            return splited_table1, splited_table2

extraction_variables(text)
splited_table1
splited_table2
But after executing this, the dictionaries are empty. Can anybody help me store the values in dictionaries from the function above?
Try this:
import re
text = '''
def cal_revenue(revenues_store, profit_margin, average_profit):
    average_profit = revenues_store * profit_margin
    return average_profit
'''

splited_table1 = {}
splited_table2 = {}

# Extract names
def extraction_variables(text):
    lines = text.split('\n')
    for line in lines:
        x = re.search(r"^def.*:$", line)
        if x is not None:
            values = x[0].split('def ')[1].split('(')
            # function_name = values[0]
            # arguments = values[1][:-2].split(', ')
            splited_table1 = values[0]
            splited_table2 = values[1][:-2].split(', ')
            return splited_table1, splited_table2

e = extraction_variables(text)
print(e)
Not much modified, but it works for me. If it doesn't work for you, please show the output of your code.
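As a side note, the assignments inside the function create new local variables instead of filling the module-level dictionaries, which is why those stay empty. If you actually want the results in a dictionary, a minimal sketch (reusing the regex from the question; the functions dict keyed by function name is my own addition) could look like this:

import re
text = '''
def cal_revenue(revenues_store, profit_margin, average_profit):
    average_profit = revenues_store * profit_margin
    return average_profit
'''

def extraction_variables(text):
    # map each function name to its list of argument names
    functions = {}
    for line in text.split('\n'):
        x = re.search(r"^def.*:$", line)
        if x is not None:
            values = x[0].split('def ')[1].split('(')
            functions[values[0]] = values[1][:-2].split(', ')
    return functions

print(extraction_variables(text))
# {'cal_revenue': ['revenues_store', 'profit_margin', 'average_profit']}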
I want to define a function that reads a table from a text file as a dictionary and then uses it to return specific values. The keys are chemical symbols (like "He" for helium); the values are their atomic masses.
I don't understand what I have to do...
The first five lines of the textfile read:
H,1.008
He,4.0026
Li,6.94
Be,9.0122
B,10.81
Here are my attempts (I don't know where to place the parameter key so that I can define it):
def read_masses():
    atom_masses = {}
    with open["average_mass.csv") as f:
        for line in f:
            (key, value) = line.split(",")
            atom_masses[key] = value
            return(value)

m = read_masses("average_mass.csv)
print(m["N"]) # for the mass of nitrogen
Once return is called, the code below it doesn't execute. What you need to return is atom_masses, not value, and you have to place the return outside the for loop:
def read_masses(file):
    atom_masses = {}
    with open(file) as f:
        for line in f:
            (key, value) = line.split(",")
            atom_masses[key] = value
    return (atom_masses)

m = read_masses("average_mass.csv")
print(m["H"])
>>> 1.008
Try:
def read_masses(name):
    data = {}
    with open(name, "r") as f_in:
        for line in map(str.strip, f_in):
            if line == "":
                continue
            a, b = map(str.strip, line.split(",", maxsplit=1))
            data[a] = float(b)
    return data

m = read_masses("your_file.txt")
print(m.get("He"))
Prints:
4.0026
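For what it's worth, the standard library's csv module can handle the splitting for you; a minimal sketch, assuming every row has exactly two fields like the file above:

import csv

def read_masses(name):
    # build {symbol: mass} from a two-column CSV file
    with open(name, newline="") as f:
        return {sym: float(mass) for sym, mass in csv.reader(f)}

m = read_masses("average_mass.csv")
print(m["He"])  # 4.0026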
The loadInventory function, with a string filename as parameter, reads the contents of inventory.txt:
def loadInventory(filename):
    inventory = {}
    inventoryFile = open(filename)
    for line in inventoryFile:
        itemID, itemStock = line.split(":")
        inventory[itemID] = itemStock
        inventory = {itemID: itemStock for itemID, itemStock in inventory.items()}
        inventory[itemID] = [itemStock.replace('\n', '')]
        # print(f"{inventory}")
    return inventory

def main():
    print(loadInventory('Inventory.txt'))

main()
Inventory.txt:
C05:10,10,5,4
C01:0,20,10,5
C11:10,20,10,1
C03:0,0,10,0
C10:1,1,1,1
Output:
{'C05': ['10,10,5,4'], 'C01': ['0,20,10,5'], 'C11': ['10,20,10,1'], 'C03': ['0,0,10,0'], 'C10': ['1,1,1,1']}
Intended Output:
{'C05':[10,10,5,4], 'C01':[0,20,10,5], 'C11':[10,20,10,1],
'C03':[0,0,10,0]}
Try replacing this line:
inventory[itemID] = [itemStock.replace('\n', '')]
with:
inventory[itemID] = [int(i) for i in itemStock.replace('\n', '').split(',')]
Or:
inventory[itemID] = list(map(int, itemStock.replace('\n', '').split(',')))
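Putting it together, a cleaned-up version of the whole function might look like this (a sketch using a with block so the file is closed automatically):

def loadInventory(filename):
    inventory = {}
    with open(filename) as inventoryFile:
        for line in inventoryFile:
            itemID, itemStock = line.strip().split(":")
            # convert the comma-separated stock counts to ints
            inventory[itemID] = [int(i) for i in itemStock.split(",")]
    return inventory

print(loadInventory('Inventory.txt'))
# {'C05': [10, 10, 5, 4], 'C01': [0, 20, 10, 5], ...}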
I have a huge dataset which contains shipper/supplier names from different sources, with near-duplicate values in it.
I tried many different techniques available on the internet, but none of them were quite satisfying, or they were too slow for data this size.
I found the OpenRefine GitHub repo for fingerprinting algorithms, added some more code, and it solved my purpose.
Have a look.
My dataset looks something like this...
import re, string
import pandas as pd
from unidecode import unidecode
from collections import defaultdict

# clean the text before processing
def cleansing_special_characters(txt):
    seps = [' ',';',':','.','`','~',',','*','#','#','|','\\','-','_','?','%','!','^','(',')','[',']','{','}','$','=','+','"','<','>',"'",' AND ', ' and ']
    default_sep = seps[0]
    txt = str(txt)
    for sep in seps[1:]:
        if sep == " AND " or sep == " and ":
            txt = txt.upper()
            txt = txt.replace(sep, ' & ')
        else:
            txt = txt.upper()
            txt = txt.replace(sep, default_sep)
    try:
        list(map(int, txt.split()))
        txt = 'NUMBERS'
    except:
        pass
    txt = re.sub(' +', ' ', txt)
    temp_list = [i.strip() for i in txt.split(default_sep)]
    temp_list = [i for i in temp_list if i]
    return " ".join(temp_list)

punctuation = re.compile('[%s]' % re.escape(string.punctuation))

class fingerprinter(object):
    # __init__ function
    def __init__(self, string):
        self.string = self._preprocess(string)

    # strip leading/trailing spaces and convert to lower case
    def _preprocess(self, string):
        return punctuation.sub('', string.strip().lower())

    def _latinize(self, string):
        return unidecode(string)
        # return unidecode(string.decode('utf-8'))

    def _unique_preserve_order(self, seq):
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]

    #-####################################################

    def get_fingerprint(self):
        return self._latinize(' '.join(self._unique_preserve_order(sorted(self.string.split()))))

    def get_ngram_fingerprint(self, n=1):
        return self._latinize(''.join(self._unique_preserve_order(sorted([self.string[i:i + n] for i in range(len(self.string) - n + 1)]))))

# read excel file
df = pd.read_excel('Input_File.xlsx')

# preprocess the column
df['Clean'] = df['SUPPLIER_NAME'].apply(cleansing_special_characters)
# step 1 cleaning

## for the n_gram fingerprint algorithm
###########################################################################################
df['n_gram_fingerprint_n2'] = df['Clean'].apply(lambda x: fingerprinter(x.replace(" ", "")).get_ngram_fingerprint(n=2))

## generate tag_id for every unique generated n_gram_fingerprint
d = defaultdict(lambda: len(d))
df['tag_idn'] = [d[x] for x in df['n_gram_fingerprint_n2']]
###########################################################################################

# drop n_gram column
df.drop(columns=['n_gram_fingerprint_n2'], inplace=True)

# make a copy to create groups of tag_id
df1 = df[['SUPPLIER_NAME', 'tag_idn']]

# drop SUPPLIER_NAME column, we have tag_id's now
df.drop(columns=['SUPPLIER_NAME'], inplace=True)

# group df by tag_id, selecting the longest cleaned name per group
#group = df.groupby('tag_id').min().reset_index()
group = df.loc[df["Clean"].str.len().groupby(df["tag_idn"]).idxmax()]

# join both data frames: group (unique) and the main data
df_merge = pd.merge(df1, group, on=['tag_idn'])

# output excel file
df_merge.to_excel('Output_File.xlsx', index=False)
This is what the output data in the Excel file looks like.
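To illustrate what the n-gram fingerprint buys you, here is a small sketch with made-up supplier names, using the functions above; all three variants collapse to the same fingerprint and would therefore receive the same tag_idn:

names = ["Acme Corp.", "ACME CORP", "acme  corp"]
for name in names:
    clean = cleansing_special_characters(name)
    # identical fingerprints -> identical tag_idn downstream
    print(fingerprinter(clean.replace(" ", "")).get_ngram_fingerprint(n=2))
# all three print the same 2-gram fingerprint: 'accmcoecmeorrp'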
I am trying to check for the NRF2 binding motif using regular expressions with Python. I have done this in R using the JASPAR2018 PWM, but due to a few issues with JASPAR I wish to redo it in Python.
Attempt
import re
import gzip
from Bio import SeqIO
from itertools import islice
import pandas as pd
import twobitreader as tbr  # assumed: the 2bit branch below uses tbr.TwoBitFile

# Creating reverse complements
def reverseComp(Seq):
    seq = Seq.upper()
    d = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
    try:
        seq = seq[::-1]
        rc_seq = "".join([d[nuc] for nuc in seq])
    except KeyError:
        return "Not Viable DNA Seq"
    return rc_seq

def genSeq(genome_path, chrom, chromstart, chromend):
    if bool(re.search('gz', genome_path)) | bool(re.search('fa', genome_path)) | bool(re.search('fasta', genome_path)):
        if bool(re.search('gz', genome_path)) == True:
            genome = SeqIO.parse(gzip.open(genome_path, 'rt'), 'fasta')
            identifiers = [seq_record.id for seq_record in genome]
            seq_gen = next(islice(genome, identifiers.index(chrom), None))
            seq = str(seq_gen.seq[chromstart:chromend])
        else:
            genome = SeqIO.parse(open(genome_path), 'fasta')
            identifiers = [seq_record.id for seq_record in genome]
            seq_gen = next(islice(genome, identifiers.index(chrom)+1, None))
            seq = str(seq_gen.seq[chromstart:chromend])
    elif bool(re.search('2bit', genome_path)):
        tbGenome = tbr.TwoBitFile(genome_path)
        seq = tbGenome[chrom][chromstart:chromend]
    else:
        raise Exception('File type not recognized')
    return (seq).upper()

pat = "[AGC]TGA[CTG][ATCG][CAT][AGT]GC[ATCG]"
pattern = re.compile(pat)

motifDF = []
motifQuant = []
with open('/Users/kalyanidhusia/Desktop/nrf2_R/ENCFF126HBJ.bed') as f:
    for line in f:
        peak = list(line.split())
        seq = genSeq('hg19.fa', peak[0], int(peak[1]), int(peak[2]))
        rSeq = reverseComp(seq)
        sequences = []
        for result in re.finditer(pattern, seq):
            sequences.append("".join(result.groups()))
        for result in re.finditer(pattern, rSeq):
            sequences.append("".join(result.groups()))
        if len(sequences) > 0:
            seqs = pd.DataFrame({'binding':sequences, 'chrom':peak[0], 'chromstart':peak[1], 'chromend':peak[2]})
            motifDF.append(seqs)
            motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])

search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
Error
This is the error I am getting:
<ipython-input-3-2e7ebdf92205> in genSeq(genome_path, chrom, chromstart, chromend)
     25         identifiers = [seq_record.id for seq_record in genome]
---> 26         seq_gen = next(islice(genome, identifiers.index(chrom)+1, None))
     27         seq = str(seq_gen.seq[chromstart:chromend])
     28     elif bool(re.search('2bit', genome_path)):

StopIteration:
How do I solve this problem?
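A note on the error itself: SeqIO.parse returns a one-shot generator, and the comprehension that collects identifiers exhausts it, so the following next(islice(...)) has nothing left to yield and raises StopIteration. One way around it is Biopython's SeqIO.index, which gives dict-like random access; a sketch, with placeholder chromosome name and coordinates:

from Bio import SeqIO

# SeqIO.parse yields records once; listing the ids consumes the stream
genome = SeqIO.parse(open('hg19.fa'), 'fasta')
identifiers = [rec.id for rec in genome]  # the generator is now exhausted

# SeqIO.index builds a dict-like lookup instead of an exhaustible iterator
records = SeqIO.index('hg19.fa', 'fasta')
seq = str(records['chr1'].seq[100000:100100]).upper()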
I was able to solve the above problem by tweaking my code a little. Here is the solved example, along with my remaining problem with the code below:
motif = '[REGULAR_EXPRESSION_FOR_YOUR_MOTIF]'
regBS = re.compile(motif)

motifDF = []
motifQuant = []
genome = tbr.TwoBitFile('/Path_to_your_genomefile_in_2bit.2bit/')
with open('/Path_to_your.bedfile/') as f:
    for line in f:
        if line.startswith('track') == False:
            peak = list(line.split())
            seq = (genome[peak[0]][int(peak[1]):int(peak[2])]).upper()
            rSeq = reverseComp(seq)
            sequences = []
            sequences.extend(re.findall(regBS, seq))
            sequences.extend(re.findall(regBS, rSeq))
            if len(sequences) > 0:
                seqs = pd.DataFrame({'binding':sequences, 'chrom':peak[0], 'chromstart':peak[1], 'chromend':peak[2], 'NR':'NRF2'})
                motifDF.append(seqs)
                motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])

search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
dist_reg.head()

n = 5
x = [len(i[6+n:-6-n]) for i in search_reg['binding']]
This code generates the peak sequences that I want and stores them in search_reg['binding'], but it also stores a space-separated number with them. I need to store them in two different columns. Any suggestions?
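One possible direction, as a sketch: if each 'binding' entry looks like "SEQUENCE 123" with a single separating space, pandas can split it once into two new columns (the column names here are made up):

# split the combined column on the first space into two columns
search_reg[['sequence', 'number']] = search_reg['binding'].str.split(' ', n=1, expand=True)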
I want to write an INI file with duplicate options, i.e.:
[test]
foo = value1
foo = value2
xxx = yyy
With ConfigParser.set, only the last value is written.
config = ConfigParser.ConfigParser()
config.read('example.cfg')
config.add_section('test')
config.set('test', service['foo'], service['value1'])
config.set('test', service['foo'], service['value2'])
config.set('test', service['xxx'], service['yyy'])
The result is:
[test]
foo = value2
xxx = yyy
Is there any way to do this?
It looks like it isn't possible in a simple way. By default, ConfigParser stores options in dicts, i.e. one value per unique key.
In the similar question Python's ConfigParser unique keys per section, the suggestions are to go with:
ConfigObj
a patched version of epydoc
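If you only need to write such a file (not read it back through ConfigParser), a minimal workaround is to emit the lines yourself; a sketch:

# write duplicate options by emitting the INI text directly,
# bypassing ConfigParser's one-value-per-unique-key model
options = [('foo', 'value1'), ('foo', 'value2'), ('xxx', 'yyy')]
with open('example.cfg', 'w') as f:
    f.write('[test]\n')
    for key, value in options:
        f.write('%s = %s\n' % (key, value))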
I have a simple custom .ini parser in Python (built for another project) which uses a list to store values, but only if they are not in key=value format. If they are key=value, only the last key is kept, since these are stored in a dictionary.
The parser can also parse nested sections like:
[SECTION1][SECTION2]
key1=value1
; etc..
The code is below. It is easy to modify to store key/value pairs in a list instead of a dictionary, or even to detect multiple identical keys and rename them to avoid collisions (e.g. key, key$1 for a second key with the same name, and so on). Use/modify as needed.
##
#
# Simple .ini Parser for Python 2.x, 3.x
#
##
import re

class Ini_Parser():
    """Simple .ini parser for Python"""

    NL = None
    ACTUAL = {
        '\\n': "\n",
        '\\t': "\t",
        '\\v': "\v",
        '\\f': "\f"
    }

    def parseStr(s, q):
        _self = Ini_Parser
        endq = s.find(q, 1)
        quoted = s[1:endq]
        rem = s[endq+1:].strip()
        for c, actual in _self.ACTUAL.items():
            quoted = (actual).join(quoted.split(c))
        quoted = ('\\').join(quoted.split('\\\\'))
        return quoted, rem

    def fromString(s, keysList=True, rootSection='_'):
        _self = Ini_Parser
        comments = [';', '#']
        if rootSection: rootSection = str(rootSection)
        else: rootSection = '_'
        if not _self.NL:
            _self.NL = re.compile(r'\n\r|\r\n|\r|\n')
        sections = {}
        currentSection = str(rootSection)
        if keysList:
            sections[currentSection] = { '__list__' : [] }
        else:
            sections[currentSection] = { }
        currentRoot = sections
        # parse the lines
        lines = re.split(_self.NL, str(s))
        # parse it line-by-line
        for line in lines:
            # strip the line of extra spaces
            line = line.strip()
            lenline = len(line)
            # comment or empty line, skip it
            if not lenline or (line[0] in comments): continue
            linestartswith = line[0]
            # section line
            if '['==linestartswith:
                SECTION = True
                # parse any sub-sections
                while '['==linestartswith:
                    if SECTION:
                        currentRoot = sections
                    else:
                        currentRoot = currentRoot[currentSection]
                    SECTION = False
                    endsection = line.find(']', 1)
                    currentSection = line[1:endsection]
                    if currentSection not in currentRoot:
                        if keysList:
                            currentRoot[currentSection] = { '__list__' : [] }
                        else:
                            currentRoot[currentSection] = { }
                    # has sub-section ??
                    line = line[endsection+1:].strip()
                    if not len(line): break
                    linestartswith = line[0]
            # key-value pairs
            else:
                # quoted string
                if '"'==linestartswith or "'"==linestartswith:
                    key, line = _self.parseStr(line, linestartswith)
                    # key-value pair
                    if line.find('=', 0)>-1:
                        line = line.split('=')
                        line.pop(0)
                        value = "=".join(line).strip()
                        valuestartswith = value[0]
                        # quoted value
                        if '"'==valuestartswith or "'"==valuestartswith:
                            value, rem = _self.parseStr(value, valuestartswith)
                        currentRoot[currentSection][key] = value
                    # single value
                    else:
                        if keysList:
                            currentRoot[currentSection]['__list__'].append(key)
                        else:
                            currentRoot[currentSection][key] = True
                # un-quoted string
                else:
                    line = line.split('=')
                    key = line.pop(0).strip()
                    # single value
                    if 1>len(line):
                        if keysList:
                            currentRoot[currentSection]['__list__'].append(key)
                        else:
                            currentRoot[currentSection][key] = True
                    # key-value pair
                    else:
                        value = "=".join(line).strip()
                        valuestartswith = value[0]
                        # quoted value
                        if '"'==valuestartswith or "'"==valuestartswith:
                            value, rem = _self.parseStr(value, valuestartswith)
                        currentRoot[currentSection][key] = value
        return sections

    def fromFile(filename, keysList=True, rootSection='_'):
        s = ''
        with open(filename, 'r') as f: s = f.read()
        return Ini_Parser.fromString(s, keysList, rootSection)

    def walk(o, key=None, top='', q='', EOL="\n"):
        s = ''
        if len(o):
            o = dict(o)
            if key: keys = [key]
            else: keys = o.keys()
            for section in keys:
                keyvals = o[section]
                if not len(keyvals): continue
                s += str(top) + "[" + str(section) + "]" + EOL
                if ('__list__' in keyvals) and len(keyvals['__list__']):
                    # only values as a list
                    s += q + (q+EOL+q).join(keyvals['__list__']) + q + EOL
                    del keyvals['__list__']
                if len(keyvals):
                    for k, v in keyvals.items():
                        if not len(v): continue
                        if isinstance(v, dict) or isinstance(v, list):
                            # sub-section
                            s += Ini_Parser.walk(keyvals, k, top + "[" + str(section) + "]", q, EOL)
                        else:
                            # key-value pair
                            s += q+k+q + '=' + q+v+q + EOL
                s += EOL
        return s

    def toString(o, rootSection='_', quote=False, EOL="\n"):
        s = ''
        if rootSection: root = str(rootSection)
        else: root = '_'
        if quote: q = '"'
        else: q = ''
        # dump the root section first, if it exists
        if root in o:
            section = dict(o[root])
            llist = None
            if '__list__' in section:
                llist = section['__list__']
                if llist and isinstance(llist, list) and len(llist):
                    s += q + (q+EOL+q).join(llist) + q + EOL
                del section['__list__']
            for k, v in section.items():
                if not len(v): continue
                s += q+k+q + '=' + q+v+q + EOL
            s += EOL
            del o[root]
        # walk the sections and sub-sections, if any
        s += Ini_Parser.walk(o, None, '', q, EOL)
        return s

    def toFile(filename, o, rootSection='_', quote=False, EOL="\n"):
        with open(filename, 'w') as f:
            f.write(Ini_Parser.toString(o, rootSection, quote, EOL))

# for use with 'import *'
__all__ = ['Ini_Parser']
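A minimal usage sketch with the file from the question (note that, as posted, duplicate key=value pairs still collide and the last one wins; bare keys go into the section's __list__):

ini = '''
[test]
foo = value1
foo = value2
xxx = yyy
'''
parsed = Ini_Parser.fromString(ini)
# duplicate 'foo' collapses to its last value with the code as posted
print(parsed['test'])  # {'__list__': [], 'foo': 'value2', 'xxx': 'yyy'}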