Storing values from specific function - python

I extracted some specific names from text. The text and function are shown below:

import re

text = '''
def cal_revenue(revenues_store, profit_margin, average_profit):
    average_profit = revenues_store * profit_margin
    return average_profit
'''

# Extract names
lines = text.split('\n')
for line in lines:
    x = re.search(r"^def.*:$", line)
    if x != None:
        values = x[0].split('def ')[1].split('(')
        function_name = values[0]
        arguments = values[1][:-2].split(', ')
        print(f"Function Name: {function_name}")
        print(f"Arguments: {arguments}")
This function works well and gives the expected results. Now I want to store all of these results in two separate dictionaries:
# Create dictionaries
splited_table1 = dict()
splited_table2 = dict()

# Extract names
def extraction_variables(text):
    lines = text.split('\n')
    for line in lines:
        x = re.search(r"^def.*:$", line)
        if x != None:
            values = x[0].split('def ')[1].split('(')
            splited_table1 = values[0]
            splited_table2 = values[1][:-2].split(', ')
    return splited_table1, splited_table2

extraction_variables(text)
splited_table1
splited_table2
But after executing this, the dictionaries are still empty. Can anybody help me store the values in dictionaries from the function above?

Try this:
import re

text = '''
def cal_revenue(revenues_store, profit_margin, average_profit):
    average_profit = revenues_store * profit_margin
    return average_profit
'''

splited_table1 = {}
splited_table2 = {}

# Extract names
def extraction_variables(text):
    lines = text.split('\n')
    for line in lines:
        x = re.search(r"^def.*:$", line)
        if x is not None:
            values = x[0].split('def ')[1].split('(')
            # function_name = values[0]
            # arguments = values[1][:-2].split(', ')
            splited_table1 = values[0]
            splited_table2 = values[1][:-2].split(', ')
    return splited_table1, splited_table2

e = extraction_variables(text)
print(e)
Not much is modified, but it works for me. The key point is to use the function's return value: the assignments to splited_table1 and splited_table2 inside the function create local variables that shadow the module-level dictionaries, which is why your dictionaries stayed empty.
If it's not working for you, you need to show the output of your code.
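If you specifically want the results in dictionaries rather than a returned tuple, one option is to have the caller store the returned values, keyed by function name. A minimal sketch building on the code above (the dictionary name function_args is just illustrative):

function_args = {}

name, args = extraction_variables(text)
function_args[name] = args

print(function_args)
# {'cal_revenue': ['revenues_store', 'profit_margin', 'average_profit']}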

Related

Return a dictionary of a function

I want to define a function that reads a table from a text file as a dictionary and then use it to return specific values. The keys are chemical symbols (like "He" for helium); the values are their atomic masses.
I don't understand what I have to do...
The first five lines of the textfile read:
H,1.008
He,4.0026
Li,6.94
Be,9.0122
B,10.81
Here are my attempts (I don't know where to place the parameter key so that I can define it):

def read_masses():
    atom_masses = {}
    with open["average_mass.csv") as f:
        for line in f:
            (key, value) = line.split(",")
            atom_masses[key] = value
            return(value)

m = read_masses("average_mass.csv)
print(m["N"]) # for the mass of nitrogen
Once return is called, the code after it doesn't execute. You need to return atom_masses, not value, and the return has to be placed outside the for loop:
def read_masses(file):
    atom_masses = {}
    with open(file) as f:
        for line in f:
            (key, value) = line.split(",")
            atom_masses[key] = value
    return (atom_masses)

m = read_masses("average_mass.csv")
print(m["H"])
>>> 1.008
Try:
def read_masses(name):
    data = {}
    with open(name, "r") as f_in:
        for line in map(str.strip, f_in):
            if line == "":
                continue
            a, b = map(str.strip, line.split(",", maxsplit=1))
            data[a] = float(b)
    return data

m = read_masses("your_file.txt")
print(m.get("He"))
Prints:
4.0026
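Since the file is plain comma-separated data, the standard csv module is another option; a minimal sketch assuming the same two-column file with no blank lines:

import csv

def read_masses(name):
    data = {}
    with open(name, newline="") as f:
        for symbol, mass in csv.reader(f):  # assumes exactly two columns per row
            data[symbol] = float(mass)
    return data

m = read_masses("your_file.txt")
print(m["He"])  # 4.0026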

How do I remove string '' from python dictionary list values?

The loadInventory function, with a string filename as parameter, reads the contents of inventory.txt:
def loadInventory(filename):
    inventory = {}
    inventoryFile = open(filename)
    for line in inventoryFile:
        itemID, itemStock = line.split(":")
        inventory[itemID] = itemStock
        inventory = {itemID: itemStock for itemID, itemStock in inventory.items()}
        inventory[itemID] = [itemStock.replace('\n', '')]
        # print(f"{inventory}")
    return inventory

def main():
    print(loadInventory('Inventory.txt'))

main()
Inventory.txt:
C05:10,10,5,4
C01:0,20,10,5
C11:10,20,10,1
C03:0,0,10,0
C10:1,1,1,1
Output:
{'C05': ['10,10,5,4'], 'C01': ['0,20,10,5'], 'C11': ['10,20,10,1'], 'C03': ['0,0,10,0'], 'C10': ['1,1,1,1']}
Intended Output:
{'C05':[10,10,5,4], 'C01':[0,20,10,5], 'C11':[10,20,10,1],
'C03':[0,0,10,0]}
Try replacing this line:

inventory[itemID] = [itemStock.replace('\n', '')]

with:

inventory[itemID] = [int(i) for i in itemStock.replace('\n', '').split(',')]

Or:

inventory[itemID] = list(map(int, itemStock.replace('\n', '').split(',')))
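Putting it together, a minimal sketch of the whole function with that fix applied (the redundant dict comprehension from the original is dropped):

def loadInventory(filename):
    inventory = {}
    with open(filename) as inventoryFile:
        for line in inventoryFile:
            itemID, itemStock = line.strip().split(":")
            # convert the comma-separated stock counts to a list of ints
            inventory[itemID] = [int(i) for i in itemStock.split(',')]
    return inventory

print(loadInventory('Inventory.txt'))
# {'C05': [10, 10, 5, 4], 'C01': [0, 20, 10, 5], ...}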

near duplicate detection python sql

I have a huge data set which contains shipper/supplier names from different sources, with near-duplicate values in it.
I tried many different techniques available on the internet, but none of them were quite satisfying, or they were too slow for data this size.
I found this OpenRefine GitHub repo of fingerprinting algorithms, added some more code, and it solved my purpose.
Have a look.
My dataset looks something like this (screenshot of supplier names not included). The code:
import re, string
import pandas as pd
from unidecode import unidecode
from collections import defaultdict

# clean the text before processing
def cleansing_special_characters(txt):
    seps = [' ',';',':','.','`','~',',','*','#','#','|','\\','-','_','?','%','!','^','(',')','[',']','{','}','$','=','+','"','<','>',"'",' AND ', ' and ']
    default_sep = seps[0]
    txt = str(txt)
    for sep in seps[1:]:
        if sep == " AND " or sep == " and ":
            txt = txt.upper()
            txt = txt.replace(sep, ' & ')
        else:
            txt = txt.upper()
            txt = txt.replace(sep, default_sep)
    try:
        list(map(int, txt.split()))
        txt = 'NUMBERS'
    except:
        pass
    txt = re.sub(' +', ' ', txt)
    temp_list = [i.strip() for i in txt.split(default_sep)]
    temp_list = [i for i in temp_list if i]
    return " ".join(temp_list)
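# For example (illustrative): cleansing_special_characters("Acme-Corp, inc.")
# uppercases, maps every separator to a space, collapses repeated spaces,
# and returns "ACME CORP INC"; purely numeric strings collapse to "NUMBERS".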
punctuation = re.compile('[%s]' % re.escape(string.punctuation))

class fingerprinter(object):
    # __init__ function
    def __init__(self, string):
        self.string = self._preprocess(string)

    # strip leading, trailing spaces and convert to lower case
    def _preprocess(self, string):
        return punctuation.sub('', string.strip().lower())

    def _latinize(self, string):
        return unidecode(string)
        # return unidecode(string.decode('utf-8'))

    def _unique_preserve_order(self, seq):
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]

    #-####################################################

    def get_fingerprint(self):
        return self._latinize(' '.join(self._unique_preserve_order(sorted(self.string.split()))))

    def get_ngram_fingerprint(self, n=1):
        return self._latinize(''.join(self._unique_preserve_order(sorted([self.string[i:i + n] for i in range(len(self.string) - n + 1)]))))
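# Quick illustration: word order and punctuation no longer matter, e.g.
#   fingerprinter("Acme Corp.").get_fingerprint() -> "acme corp"
#   fingerprinter("CORP ACME").get_fingerprint()  -> "acme corp"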
# read excel file
df = pd.read_excel('Input_File.xlsx')

# preprocess the column
df['Clean'] = df['SUPPLIER_NAME'].apply(cleansing_special_characters)
# step 1 cleaning

## for n_gram fingerprint algorithm
###########################################################################################
df['n_gram_fingerprint_n2'] = df['Clean'].apply(lambda x: fingerprinter(x.replace(" ", "")).get_ngram_fingerprint(n=2))

## generate tag_id for every unique generated n_gram_fingerprint
d = defaultdict(lambda: len(d))
df['tag_idn'] = [d[x] for x in df['n_gram_fingerprint_n2']]
###########################################################################################

# drop n_gram column
df.drop(columns=['n_gram_fingerprint_n2'], inplace=True)

# make copy to create group of tag_id
df1 = df[['SUPPLIER_NAME','tag_idn']]

# drop SUPPLIER_NAME column, we have tag_id's now
df.drop(columns=['SUPPLIER_NAME'], inplace=True)

# group df by tag_id, keeping the row with the longest cleaned name
# group = df.groupby('tag_id').min().reset_index()
group = df.loc[df["Clean"].str.len().groupby(df["tag_idn"]).idxmax()]

# join both the data frames group (unique) and main data
df_merge = pd.merge(df1, group, on=['tag_idn'])

# output excel file
df_merge.to_excel('Output_File.xlsx', index=False)
This is what the output data in an Excel file looks like (screenshot not included).
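For a quick sanity check without the Excel files, the same pipeline can be exercised on an in-memory frame; a hedged sketch with made-up supplier names:

df = pd.DataFrame({'SUPPLIER_NAME': ['Acme Corp.', 'ACME-CORP', 'Globex Ltd']})
df['Clean'] = df['SUPPLIER_NAME'].apply(cleansing_special_characters)
df['fp'] = df['Clean'].apply(lambda x: fingerprinter(x.replace(" ", "")).get_ngram_fingerprint(n=2))
d = defaultdict(lambda: len(d))
df['tag_idn'] = [d[x] for x in df['fp']]
print(df[['SUPPLIER_NAME', 'tag_idn']])
# the two Acme variants share one tag_idn; Globex gets its own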

How to do motif search using python?

I am trying to check for the nrf2 binding motif using regular expressions in Python. I had done this in R using the JASPAR2018 PWM, but due to a few issues with JASPAR I wish to redo it in Python.
Attempt

import re
import gzip
from Bio import SeqIO
from itertools import islice
import pandas as pd
import twobitreader as tbr  # assumed import for tbr.TwoBitFile used below

# Creating Reverse Complements
def reverseComp(Seq):
    seq = Seq.upper()
    d = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
    try:
        seq = seq[::-1]
        rc_seq = "".join([d[nuc] for nuc in seq])
    except KeyError:
        return "Not Viable DNA Seq"
    return rc_seq
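# e.g. reverseComp("ATGC") -> "GCAT"; any character outside A/T/G/C returns "Not Viable DNA Seq"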
def genSeq(genome_path, chrom, chromstart, chromend):
    if bool(re.search('gz', genome_path)) | bool(re.search('fa', genome_path)) | bool(re.search('fasta', genome_path)):
        if bool(re.search('gz', genome_path)) == True:
            genome = SeqIO.parse(gzip.open(genome_path, 'rt'), 'fasta')
            identifiers = [seq_record.id for seq_record in genome]
            seq_gen = next(islice(genome, identifiers.index(chrom), None))
            seq = str(seq_gen.seq[chromstart:chromend])
        else:
            genome = SeqIO.parse(open(genome_path), 'fasta')
            identifiers = [seq_record.id for seq_record in genome]
            seq_gen = next(islice(genome, identifiers.index(chrom)+1, None))
            seq = str(seq_gen.seq[chromstart:chromend])
    elif bool(re.search('2bit', genome_path)):
        tbGenome = tbr.TwoBitFile(genome_path)
        seq = tbGenome[chrom][chromstart:chromend]
    else:
        raise Exception('File type not recognized')
    return (seq).upper()

pat = "[AGC]TGA[CTG][ATCG][CAT][AGT]GC[ATCG]"
pattern = re.compile(pat)

motifDF = []
motifQuant = []
with open('/Users/kalyanidhusia/Desktop/nrf2_R/ENCFF126HBJ.bed') as f:
    for line in f:
        peak = list(line.split())
        seq = genSeq('hg19.fa', peak[0], int(peak[1]), int(peak[2]))
        rSeq = reverseComp(seq)
        sequences = []
        for result in re.finditer(pattern, seq):
            sequences.append("".join(result.groups()))
        for result in re.finditer(pattern, rSeq):
            sequences.append("".join(result.groups()))
        if len(sequences) > 0:
            seqs = pd.DataFrame({'binding':sequences, 'chrom':peak[0], 'chromstart':peak[1], 'chromend':peak[2]})
            motifDF.append(seqs)
            motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])

search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
Error

This is the error I am getting:

<ipython-input-3-2e7ebdf92205> in genSeq(genome_path, chrom, chromstart, chromend)
     25         identifiers = [seq_record.id for seq_record in genome]
---> 26         seq_gen = next(islice(genome, identifiers.index(chrom)+1, None))
     27         seq = str(seq_gen.seq[chromstart:chromend])
     28     elif bool(re.search('2bit', genome_path)):

StopIteration:
How do I solve this problem?
I was able to solve the above problem by tweaking my code a little: reading the genome from a 2bit file with tbr.TwoBitFile avoids iterating the FASTA parser twice (the list comprehension over the parser exhausts the generator, which is why next(islice(...)) raised StopIteration). Here is the solved example, followed by my remaining problem with the code below:
motif = '[REGULAR_EXPRESSION_FOR_YOUR_MOTIF]'
regBS = re.compile(motif)

motifDF = []
motifQuant = []
genome = tbr.TwoBitFile('/Path_to_your_genomefile_in_2bit.2bit/')
with open('/Path_to_your.bedfile/') as f:
    for line in f:
        if line.startswith('track') == False:
            peak = list(line.split())
            seq = (genome[peak[0]][int(peak[1]):int(peak[2])]).upper()
            rSeq = reverseComp(seq)
            sequences = []
            sequences.extend(re.findall(regBS, seq))
            sequences.extend(re.findall(regBS, rSeq))
            if len(sequences) > 0:
                seqs = pd.DataFrame({'binding':sequences, 'chrom':peak[0], 'chromstart':peak[1], 'chromend':peak[2], 'NR':'NRF2'})
                motifDF.append(seqs)
                motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])

search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
dist_reg.head()

n = 5
x = [len(i[6+n:-6-n]) for i in search_reg['binding']]
This code generates the peak sequences I want and stores them in search_reg['binding'], but it also stores a space-separated number along with each sequence. I need to store them in two different columns. Any suggestions?
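If the sequence and the number really are separated by whitespace within each 'binding' value, pandas can split the column in one step; a hedged sketch (the 'score' column name is just illustrative):

# split each 'binding' string on whitespace into two new columns
parts = search_reg['binding'].str.split(expand=True)
search_reg['binding'] = parts[0]  # the motif sequence
search_reg['score'] = parts[1]    # the trailing number (illustrative name)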

How to write an INI file with ConfigParser with duplicate options

I want to write an INI file with duplicate options, i.e.:
[test]
foo = value1
foo = value2
xxx = yyy
With ConfigParser.set, only the last value is written.
config = ConfigParser.ConfigParser()
config.read('example.cfg')
config.add_section('test')
config.set('test', service['foo'], service['value1'])
config.set('test', service['foo'], service['value2'])
config.set('test', service['xxx'], service['yyy'])
The result is:
[test]
foo = value2
xxx = yyy
Is there any way?
It looks like it isn't possible in a simple way. ConfigParser stores options in dicts, i.e. one value per unique key.
In the similar question Python's ConfigParser unique keys per section, the suggestions are to go with:
ConfigObj
Patched version of epydoc
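Alternatively, since the INI format itself allows duplicate options, you can write the section by hand and skip ConfigParser for output entirely; a minimal sketch:

# write the section manually so duplicate keys survive
pairs = [("foo", "value1"), ("foo", "value2"), ("xxx", "yyy")]
with open("example.cfg", "w") as f:
    f.write("[test]\n")
    for key, value in pairs:
        f.write("%s = %s\n" % (key, value))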
I have a simple custom .ini parser in Python (built for another project) which uses a list to store values, but only if they are not in key=value format. If key=value, then the last key wins, since these are stored in a dictionary.
The parser can also parse nested sections like:
[SECTION1][SECTION2]
key1=value1
; etc..
The code is below. It is easy to modify to store key/value pairs in a list instead of a dictionary, or even to detect multiple keys and rename them to avoid collisions (e.g. key, key$1 for a second key with the same name, and so on). Use/modify as needed.
##
#
# Simple .ini Parser for Python 2.x, 3.x
#
##

import re

class Ini_Parser():
    """Simple .ini parser for Python"""

    NL = None
    ACTUAL = {
        '\\n' : "\n",
        '\\t' : "\t",
        '\\v' : "\v",
        '\\f' : "\f"
    }

    def parseStr(s, q):
        _self = Ini_Parser
        endq = s.find(q, 1)
        quoted = s[1:endq]
        rem = s[endq+1:].strip()
        for c, actual in _self.ACTUAL.items():
            quoted = ( actual ).join( quoted.split( c ) )
        quoted = ( '\\' ).join( quoted.split( '\\\\' ) )
        return quoted, rem

    def fromString(s, keysList=True, rootSection='_'):
        _self = Ini_Parser
        comments = [';', '#']
        if rootSection: rootSection = str(rootSection)
        else: rootSection = '_'
        if not _self.NL:
            _self.NL = re.compile(r'\n\r|\r\n|\r|\n')
        sections = {}
        currentSection = str(rootSection)
        if keysList:
            sections[currentSection] = { '__list__' : [] }
        else:
            sections[currentSection] = { }
        currentRoot = sections
        # parse the lines
        lines = re.split(_self.NL, str(s))
        # parse it line-by-line
        for line in lines:
            # strip the line of extra spaces
            line = line.strip()
            lenline = len(line)
            # comment or empty line, skip it
            if not lenline or (line[0] in comments): continue
            linestartswith = line[0]
            # section line
            if '['==linestartswith:
                SECTION = True
                # parse any sub-sections
                while '['==linestartswith:
                    if SECTION:
                        currentRoot = sections
                    else:
                        currentRoot = currentRoot[currentSection]
                    SECTION = False
                    endsection = line.find(']', 1)
                    currentSection = line[1:endsection]
                    if currentSection not in currentRoot:
                        if keysList:
                            currentRoot[currentSection] = { '__list__' : [] }
                        else:
                            currentRoot[currentSection] = { }
                    # has sub-section ??
                    line = line[endsection+1:].strip()
                    if not len(line): break
                    linestartswith = line[0]
            # key-value pairs
            else:
                # quoted string
                if '"'==linestartswith or "'"==linestartswith:
                    key, line = _self.parseStr(line, linestartswith)
                    # key-value pair
                    if line.find('=', 0)>-1:
                        line = line.split('=')
                        line.pop(0)
                        value = "=".join(line).strip()
                        valuestartswith = value[0]
                        # quoted value
                        if '"'==valuestartswith or "'"==valuestartswith:
                            value, rem = _self.parseStr(value, valuestartswith)
                        currentRoot[currentSection][key] = value
                    # single value
                    else:
                        if keysList:
                            currentRoot[currentSection]['__list__'].append(key)
                        else:
                            currentRoot[currentSection][key] = True
                # un-quoted string
                else:
                    line = line.split('=')
                    key = line.pop(0).strip()
                    # single value
                    if 1>len(line):
                        if keysList:
                            currentRoot[currentSection]['__list__'].append(key)
                        else:
                            currentRoot[currentSection][key] = True
                    # key-value pair
                    else:
                        value = "=".join(line).strip()
                        valuestartswith = value[0]
                        # quoted value
                        if '"'==valuestartswith or "'"==valuestartswith:
                            value, rem = _self.parseStr(value, valuestartswith)
                        currentRoot[currentSection][key] = value
        return sections

    def fromFile(filename, keysList=True, rootSection='_'):
        s = ''
        with open(filename, 'r') as f: s = f.read()
        return Ini_Parser.fromString(s, keysList, rootSection)

    def walk(o, key=None, top='', q='', EOL="\n"):
        s = ''
        if len(o):
            o = dict(o)
            if key: keys = [key]
            else: keys = o.keys()
            for section in keys:
                keyvals = o[section]
                if not len(keyvals): continue
                s += str(top) + "[" + str(section) + "]" + EOL
                if ('__list__' in keyvals) and len(keyvals['__list__']):
                    # only values as a list
                    s += q + (q+EOL+q).join(keyvals['__list__']) + q + EOL
                    del keyvals['__list__']
                if len(keyvals):
                    for k, v in keyvals.items():
                        if not len(v): continue
                        if isinstance(v, dict) or isinstance(v, list):
                            # sub-section
                            s += Ini_Parser.walk(keyvals, k, top + "[" + str(section) + "]", q, EOL)
                        else:
                            # key-value pair
                            s += q+k+q+ '=' +q+v+q + EOL
                s += EOL
        return s

    def toString(o, rootSection='_', quote=False, EOL="\n"):
        s = ''
        if rootSection: root = str(rootSection)
        else: root = '_'
        if quote: q = '"'
        else: q = ''
        # dump the root section first, if exists
        if root in o:
            section = dict(o[root])
            llist = None
            if '__list__' in section:
                llist = section['__list__']
            if llist and isinstance(llist, list) and len(llist):
                s += q + (q+EOL+q).join(llist) + q + EOL
                del section['__list__']
            for k, v in section.items():
                if not len(v): continue
                s += q+k+q+ '=' +q+v+q + EOL
            s += EOL
            del o[root]
        # walk the sections and sub-sections, if any
        s += Ini_Parser.walk(o, None, '', q, EOL)
        return s

    def toFile(filename, o, rootSection='_', quote=False, EOL="\n"):
        with open(filename, 'w') as f:
            f.write( Ini_Parser.toString(o, rootSection, quote, EOL) )

# for use with 'import *'
__all__ = [ 'Ini_Parser' ]
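For completeness, a quick usage sketch (Python 3; the file content and keys are illustrative):

ini_text = """
[test]
foo=value1
foo=value2
xxx=yyy
"""

parsed = Ini_Parser.fromString(ini_text)
# key=value pairs still collapse to the last value; bare lines would go into '__list__'
print(parsed['test'])  # {'__list__': [], 'foo': 'value2', 'xxx': 'yyy'}

# note: toString() deletes the root section from the dict passed in
print(Ini_Parser.toString(parsed))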
