Python dictionary generation, "too many values to unpack" - python

Trying to generate a dictionary from a list of data parsed from .csv files. Getting the error "too many values to unpack" — has anyone got any ideas for a fix?
There will be repeating keys/multiple values to append to each key.
I'm pretty new to Python and programming, so please add a short explanation of what went wrong and how to fix it.
Below the script is the data how it appears when res is printed.
#!/usr/bin/python
# Question script (Python 2): parse "name:percent" cells out of CSV rows,
# collect them as [key, percent] pairs in `res`, then try to group them
# into a dict-of-lists.  The grouping step at the bottom is where the
# reported ValueError comes from.
import csv
import pprint
pp = pprint.PrettyPrinter(indent=4)
import sys
import getopt
res = []  # accumulates [line[1]+name, percent-string] pairs from every file
import argparse
parser = argparse.ArgumentParser()
parser.add_argument ("infile", metavar="CSV", nargs="+", type=str, help="data file")
args = parser.parse_args()
# NOTE(review): "wb" is the Python 2 idiom for csv output; under Python 3
# this would need newline='' in text mode instead.
with open("out.csv","wb") as f:
    output = csv.writer(f)
    for filename in args.infile:
        for line in csv.reader(open(filename)):
            # cells 0 and 1 are metadata; data cells start at index 2
            for item in line[2:]:
                #to skip empty cells
                if not item.strip():
                    continue
                item = item.split(":")
                item[1] = item[1].rstrip("%")
                # print([line[1]+item[0],item[1]])
                res.append([line[1]+item[0],item[1]])
                # output.writerow([line[1]+item[0],item[1].rstrip("%")])
pp.pprint( res )
from collections import defaultdict
# BUG (the subject of the question): this wraps `res` in ANOTHER list, so
# `initial_list` is one element long and that single element is the whole
# `res` list of ~100 pairs.  Unpacking it as `k, v` then raises
# "ValueError: too many values to unpack".  Iterate over `res` directly.
initial_list = [res]
d = defaultdict(list)
pp.pprint( d )
for k, v in initial_list:
    d[k].append(float(v)) # possibly `int(v)` ?
and the console
[ ['P1L', '2.04'],
['Q2R', '1.93'],
['V3I', '20.03'],
['V3M', '78.18'],
['V3S', '1.67'],
['T4L', '1.16'],
['T12N', '75.60'],
['T12S', '22.73'],
['K14E', '1.03'],
['K14R', '50.65'],
['I15*', '63.94'],
['I15V', '35.30'],
['G17A', '38.31'],
['Q18R', '38.43'],
['L19T', '98.62'],
['L24*', '2.18'],
['D25E', '1.87'],
['D25N', '2.17'],
['M36I', '99.76'],
['S37N', '97.23'],
['R41K', '99.03'],
['L63V', '99.42'],
['H69K', '99.30'],
['I72V', '5.76'],
['V82I', '98.70'],
['L89M', '98.49'],
['I93L', '99.64'],
['P4S', '99.09'],
['V35T', '99.26'],
['E36A', '98.23'],
['T39D', '98.78'],
['G45R', '3.11'],
['S48T', '99.70'],
['V60I', '99.44'],
['K102R', '1.04'],
['K103N', '99.11'],
['G112E', '2.77'],
['D123N', '8.14'],
['D123S', '91.12'],
['I132M', '1.41'],
['K173A', '99.55'],
['Q174K', '99.68'],
['D177E', '98.95'],
['G190R', '2.56'],
['E194K', '2.54'],
['T200A', '99.28'],
['Q207E', '98.75'],
['R211K', '98.77'],
['W212*', '3.00'],
['L214F', '99.25'],
['V245E', '99.30'],
['E248D', '99.58'],
['D250E', '99.02'],
['T286A', '99.70'],
['K287R', '1.78'],
['E291D', '99.22'],
['V292I', '98.28'],
['I293V', '99.58'],
['V317A', '28.20'],
['L325V', '2.40'],
['G335D', '98.33'],
['F346S', '4.42'],
['N348I', '3.81'],
['R356K', '71.43'],
['M357I', '20.00'],
['M357T', '80.00']]
defaultdict(<type 'list'>, {})
Traceback (most recent call last):
File "test.py", line 40, in <module>
for k, v in initial_list:
ValueError: too many values to unpack

You are wrapping the result in a list:
initial_list = [res]
then try to iterate over the list:
d = defaultdict(list)
pp.pprint( d )
for k, v in initial_list:
d[k].append(float(v)) # possibly `int(v)` ?
You want to loop over res instead:
d = defaultdict(list)
for k, v in res:
d[k].append(float(v))
You can do all this in the CSV reading loop:
from collections import defaultdict

# Build the {key: [percentages]} mapping directly while reading the CSV
# files, instead of accumulating `res` and looping over it a second time.
d = defaultdict(list)
# NOTE(review): "wb" matches the Python 2 csv idiom used by the question;
# Python 3 would need mode "w" with newline='' instead.
with open("out.csv","wb") as f:
    output = csv.writer(f)
    for filename in args.infile:
        for line in csv.reader(open(filename)):
            # cells 0 and 1 are metadata; data cells start at index 2
            for item in line[2:]:
                #to skip empty cells
                if not item.strip():
                    continue
                # split only on the first ":" so values containing ":" survive
                key, value = item.split(":", 1)
                value = value.rstrip("%")
                # FIX: was `line1[1]` — a NameError, since no `line1` exists;
                # the current CSV row is named `line`.
                d[line[1] + key].append(float(value))

Related

Python: put numbers into list

I have a text file that looks like this:
1 acatccacgg atgaaggaga ggagaaatgt ttcaaatcag ttctaacacg aaaaccaatt
61 ccaagaccaa gttatgaaat taccactaag cagcagtgaa agaactacat attgaagtca
121 gataagaaag caagctgaag agcaagcact gggcatcttt cttgaaaaaa gtaaggccca
181 agtaacagac tatcagattt ttttgcagtc tttgcattcc tactagatga ttcacagaga
241 agatagtcac atttatcatt cgaaaacatg aaagaattcc agtcagaact tgcatttggg
301 ggcatgtaag tctcaaggtt gtctttttgc caatgtgctg taacattatt gcactcagag
361 tgtactgctg acagccactg ttctgccgaa atgacagaaa atagggaaca
I am trying to read the txt file and make a dictionary that puts the text information into a dictionary like this: {1:[acatccacgg,atgaaggaga, ggagaaatgt, ttcaaatcag, ttctaacacg, aaaaccaatt], 61 : ...}
I have no clue how to do this...I am really new to python
you can try this line of code.
# Build {first-token: [remaining tokens]} from each line of test.txt.
# Improvements over the draft: `with` guarantees the file is closed even on
# error; split() with no argument tolerates runs of whitespace (split(' ')
# would produce empty-string tokens); blank lines are skipped instead of
# creating a bogus '' : [] entry.
mydictionary = {}
with open('test.txt','r') as f:
    for x in f:
        temp = x.split()
        if temp:
            mydictionary.update({temp[0]:temp[1:]})
print(mydictionary)
this is the cleaner, and more readable way to do so (just try it, and you will understand):
import re
from os.path import exists


def put_in_dict(directory: str) -> dict:
    """Parse a numbered sequence file into ``{line_number: [words]}``.

    Each line of the file looks like ``61 ccaagaccaa gttatgaaat ...``:
    a leading line number followed by whitespace-separated groups of
    characters.  The number (kept as a string) becomes the key and the
    remaining groups become the value list.

    Returns an empty dict when *directory* does not name an existing file.

    Parsing per line (instead of running separate ``findall`` passes over
    the whole text and re-joining them by index, as the draft did) keeps
    each key paired with the words from its own line.
    """
    my_dict = {}
    if not exists(directory):
        return my_dict
    token_pattern = re.compile(r"\w+")
    with open(directory) as file:
        for line in file.read().strip().splitlines():
            tokens = token_pattern.findall(line)
            if tokens:
                # first token is the line number; the rest are the data words
                my_dict[tokens[0]] = tokens[1:]
    return my_dict

Parsing blocks of text data with python itertools.groupby

I'm trying to parse a blocks of text in python 2.7 using itertools.groupby
The data has the following structure:
BEGIN IONS
TITLE=cmpd01_scan=23
RTINSECONDS=14.605
PEPMASS=694.299987792969 505975.375
CHARGE=2+
615.839727 1760.3752441406
628.788226 2857.6264648438
922.4323436 2458.0959472656
940.4432533 9105.5
END IONS
BEGIN IONS
TITLE=cmpd01_scan=24
RTINSECONDS=25.737
PEPMASS=694.299987792969 505975.375
CHARGE=2+
575.7636234 1891.1656494141
590.3553938 2133.4477539063
615.8339562 2433.4252929688
615.9032114 1784.0628662109
END IONS
I need to extract information from the lines beginning with "TITLE=", "PEPMASS=", "CHARGE=".
The code I'm using as follows:
# Question code (Python 2): split an .mgf file into records with
# itertools.groupby and grep three header fields out of each record.
import itertools
import re
data_file='Test.mgf'
# True exactly on the record terminator line; groupby() then yields
# alternating separator groups (key=True) and record groups (key=False).
def isa_group_separator(line):
    return line=='END IONS\n'
regex_scan = re.compile(r'TITLE=')
regex_precmass=re.compile(r'PEPMASS=')
regex_charge=re.compile(r'CHARGE=')
with open(data_file) as f:
    for (key,group) in itertools.groupby(f,isa_group_separator):
        #print(key,list(group))
        if not key:
            # BUG (the subject of the question): `group` is a one-shot
            # iterator.  The first filter() consumes it entirely, so the
            # second and third filters see an empty stream and return [].
            precmass_match = filter(regex_precmass.search,group)
            print precmass_match
            scan_match= filter(regex_scan.search,group)
            print scan_match
            charge_match = filter(regex_charge.search,group)
            print charge_match
However, the output only picks up the "PEPMASS=" line,and if 'scan_match' assignment is done before 'precmass_match', the "TITLE=" line is printed only;
> ['PEPMASS=694.299987792969 505975.375\n'] [] []
> ['PEPMASS=694.299987792969 505975.375\n'] [] []
can someone point out what I'm doing wrong here?
The reason for this is that group is an iterator and it runs only once.
Please find the modified script that does the job.
# Answer (Python 2): same structure as the question's script, with the
# group iterator materialized into a list so it can be scanned repeatedly.
import itertools
import re
data_file='Test.mgf'
# Marks the record terminator; groupby() alternates separator (True) and
# record (False) groups on this key.
def isa_group_separator(line):
    return line == 'END IONS\n'
regex_scan = re.compile(r'TITLE=')
regex_precmass = re.compile(r'PEPMASS=')
regex_charge = re.compile(r'CHARGE=')
with open(data_file) as f:
    for (key, group) in itertools.groupby(f, isa_group_separator):
        if not key:
            # The fix: list(group) snapshots the one-shot iterator, so all
            # three filter() passes below see the full record.
            g = list(group)
            precmass_match = filter(regex_precmass.search, g)
            print precmass_match
            scan_match = filter(regex_scan.search, g)
            print scan_match
            charge_match = filter(regex_charge.search, g)
            print charge_match
I might try to parse this way (without using groupby):
import re

# Each record's interesting lines look like "KEY=value"; a TITLE line
# marks the start of a new record, so we open a fresh sublist on TITLE
# and append every matched value to the most recent sublist.
mgf_lines = """\
BEGIN IONS
TITLE=cmpd01_scan=23
RTINSECONDS=14.605
PEPMASS=694.299987792969 505975.375
CHARGE=2+
615.839727 1760.3752441406
628.788226 2857.6264648438
922.4323436 2458.0959472656
940.4432533 9105.5
END IONS
BEGIN IONS
TITLE=cmpd01_scan=24
RTINSECONDS=25.737
PEPMASS=694.299987792969 505975.375
CHARGE=2+
575.7636234 1891.1656494141
590.3553938 2133.4477539063
615.8339562 2433.4252929688
615.9032114 1784.0628662109
END IONS""".splitlines()

field_re = re.compile(r'(TITLE|PEPMASS|CHARGE)=(.+)')

data = []
for row in mgf_lines:
    match = field_re.match(row)
    if match is None:
        continue  # data/other lines carry no wanted field
    tag, payload = match.group(1), match.group(2)
    if tag == 'TITLE':
        data.append([])  # TITLE opens a new record
    data[-1].append(payload)
print(data)
Prints:
[['cmpd01_scan=23', '694.299987792969 505975.375', '2+'], ['cmpd01_scan=24', '694.299987792969 505975.375', '2+']]

Aggregating values in one column by their corresponding value in another from two files

had a question regarding summing the multiple values of duplicate keys into one key with the aggregate total. For example:
1:5
2:4
3:2
1:4
Very basic but I'm looking for an output that looks like:
1:9
2:4
3:2
In the two files I am using, I am dealing with a list of 51 users(column 1 of user_artists.dat) who have the artistID(column 2) and how many times that user has listened to that particular artist given by the weight(column 3).
I am attempting to aggregate the total times that artist has been played, across all users and display it in a format such as:
Britney Spears (289) 2393140. Any help or input would be so appreciated.
# Question code: read artists.dat and user_artists.dat (tab-separated,
# header row skipped) and try to sum play counts per artist.  The
# aggregation loop mixes pandas-style calls into plain-dict code and is
# the part being asked about.
import codecs
#from collections import defaultdict
with codecs.open("artists.dat", encoding = "utf-8") as f:
    artists = f.readlines()
with codecs.open("user_artists.dat", encoding = "utf-8") as f:
    users = f.readlines()
artist_list = [x.strip().split('\t') for x in artists][1:]
user_stats_list = [x.strip().split('\t') for x in users][1:]
# map artistID -> artist name
artists = {}
for a in artist_list:
    artistID, name = a[0], a[1]
    artists[artistID] = name
grouped_user_stats = {}
for u in user_stats_list:
    # columns: userID, artistID, weight (all strings)
    userID, artistID, weight = u
    # NOTE(review): .astype() is a NumPy/pandas method; on a plain dict
    # whose keys don't exist yet, both lines raise (KeyError first).
    grouped_user_stats[artistID] = grouped_user_stats[artistID].astype(int)
    grouped_user_stats[weight] = grouped_user_stats[weight].astype(int)
    # BUG: u is a list of three strings, so this tries to unpack each
    # string into two names — ValueError for any token not of length 2.
    for artistID, weight in u:
        # .groupby() is a pandas DataFrame method, not available on dict
        grouped_user_stats.groupby('artistID')['weight'].sum()
        print(grouped_user_stats.groupby('artistID')['weight'].sum())
    #if userID not in grouped_user_stats:
    #grouped_user_stats[userID] = { artistID: {'name': artists[artistID], 'plays': 1} }
    #else:
    #if artistID not in grouped_user_stats[userID]:
    #grouped_user_stats[userID][artistID] = {'name': artists[artistID], 'plays': 1}
    #else:
    #grouped_user_stats[userID][artistID]['plays'] += 1
    #print('this never happens')
#print(grouped_user_stats)
how about:
import codecs
from collections import defaultdict

# Sum listening weights per artist across all users, then print
# "<artist name> (artist id) <total>" lines.

# read stuff
with codecs.open("artists.dat", encoding = "utf-8") as f:
    artists = f.readlines()
with codecs.open("user_artists.dat", encoding = "utf-8") as f:
    users = f.readlines()

# transform artist data in a dict with "artist id" as key and "artist name" as value
artist_repo = dict(x.strip().split('\t')[:2] for x in artists[1:])
user_stats_list = [x.strip().split('\t') for x in users][1:]

grouped_user_stats = defaultdict(int)
for u in user_stats_list:
    # columns are userID, artistID, weight
    # FIX: accumulate under the *artist* id u[1]; the draft used u[0],
    # which is the user id and therefore summed per user, not per artist.
    grouped_user_stats[u[1]] += int(u[2])

# extra: "fancying" the data transforming the keys of the dict in "<artist name> (artist id)" format
# (.items() and print(...) behave the same on Python 2 and 3, unlike the
# draft's iteritems()/print-statement pair)
grouped_user_stats = dict(("%s (%s)" % (artist_repo.get(k,"Unknown artist"), k), v) for k ,v in grouped_user_stats.items() )

# lastly print it
for k, v in grouped_user_stats.items():
    print("%s %s" % (k, v))

create a list of list of parameters from a file

Hi i am trying to create a list of parameters from a file
The final result should be something like
param=[[field],[units],[height],[site]]
The problem is that the information is split into lines and some of the parameters do not have all the information
#info in the file
[field1]
unit=m/s
height=70.4
site=site1
[field2]
height=20.6
site=site2
[field3]
units=m
...
so i would like to fulfill all the fields in such a way that, if there is not information assigns 0 or ''
Final result in the example
param={field1:'m/s',70.4,'site1',field2:'',20.6,site2, field3:'m',0,''}
I know how to create a dictionary from list of lists but not to set default values ('' for the strings values an 0 for the numeric ones) in case some values are missing
Thanks
You could group using a defaultdict:
from collections import defaultdict

# Group the file's values by key: "[...]" header lines are collected
# under the "fields" key; every other line is "key=value" and its value
# is appended under that key.
with open("test.txt") as f:
    d = defaultdict(list)
    for raw in f:
        line = raw.rstrip()
        if not line.startswith("["):
            k,v = line.split("=")
            d[k].append(v)
        else:
            d["fields"].append(line.strip("[]"))
Input::
[field1]
unit=m/s
height=70.4
site=site1
[field2]
height=20.6
site=site2
[field3]
unit=m
height=6.0
site=site3
Output:
defaultdict(<type 'list'>, {'fields': ['field1', 'field2', 'field3'],
'site': ['site1', 'site2', 'site3'], 'unit': ['m/s', 'm'],
'height': ['70.4', '20.6', '6.0']})
If you actually want to group by field, you can use itertools.groupby grouping on lines that start with [:
from itertools import groupby

# Group a [fieldN]-sectioned file into {field name: [body lines]}.
# groupby alternates between header groups (k=True, lines starting with
# "[") and body groups (k=False, everything until the next header).
with open("test.txt") as f:
    grps, d = groupby(map(str.rstrip,f), key=lambda x: x.startswith("[")), {}
    for k,v in grps:
        if k:
            # take the header line itself for the key, then advance the
            # *outer* grouper to pull the following body group as the value
            # (this double-advance is why iterating `grps` manually works)
            k, v = next(v).strip("[]"), list(next(grps)[1])
            d[k] = v
print(d)
Output:
{'field2': ['height=20.6', 'site=site2'],
'field3': ['unit=m', 'height=6.0', 'site=site3'],
'field1': ['unit=m/s', 'height=70.4', 'site=site1']}
Each k is a line starting with [, we then call next on the grouper object to get all the lines up to the next line starting with [ or the EOF:
This would fill in the missing information.
# Fill field/units/height/site in lockstep, substituting 0 or '' when a
# section is missing one of its lines.  Works by peeking at lines i..i+3
# and rewinding `i` (i -= 1) whenever an expected line is absent, so the
# next probe re-reads the line that didn't match.
# NOTE(review): fragile by design — bare `except:` clauses stand in for
# end-of-file checks, the file is never closed, and the loop only exits
# via the `break` in the final handler when lines run out.
f= open('file.txt','r')
field, units, height, site = [],[],[],[]
param = [ field, units, height, site]
lines = f.readlines()
i=0
while True:
    try:
        line1 = lines[i].rstrip()
        if line1.startswith('['):
            field.append(line1.strip('[]'))
        else:
            # not a [field] header: record a placeholder and rewind
            field.append(0)
            i-= 1
    except:
        field.append(0)
    try:
        line2 = lines[i+1].rstrip()
        if line2.startswith('unit') or line2.startswith('units'):
            units.append(line2.split('=')[-1])
        else:
            units.append('')
            i-=1
    except:
        units.append('')
    try:
        line3 = lines[i+2].rstrip()
        if line3.startswith('height'):
            height.append(line3.split('=')[-1])
        else:
            height.append(0)
            i-=1
    except:
        height.append(0)
    try:
        line4 = lines[i+3].rstrip()
        if line4.startswith('site'):
            site.append(line4.split('=')[-1])
        else:
            site.append('')
    except:
        # ran past the end of the file: pad the last slot and stop
        site.append('')
        break
    i +=4
Output:
param:
[['field1', 'field2', 'field3'],
['m/s', '', 'm'],
['70.4', '20.6', 0],
['site1', 'site2', '']]

Append values in the same key of a dictionary

How to add different values in the same key of a dictionary? These different values are added
in a loop.
Below is what I desired entries in the dictionary data_dict
data_dict = {}
And during each iterations, output should looks like:
Iteration1 -> {'HUBER': {'100': 5.42}}
Iteration2 -> {'HUBER': {'100': 5.42, '10': 8.34}}
Iteration3 -> {'HUBER': {'100': 5.42, '10': 8.34, '20': 7.75}} etc
However, at the end of the iterations, data_dict is left with the last entry only:
{'HUBER': {'80': 5.50}}
Here's the code:
# Question code (Python 2): collect per-method mean errors from result
# files named like "prefix_METHOD_POINTS_..._ALGO_suffix.txt".
import glob
path = "./meanFilesRun2/*.txt"
all_files = glob.glob(path)
data_dict = {}
# Record the file's final-line mean error under data_dict[method][points].
def func_(all_lines, method, points, data_dict):
    if method == "HUBER":
        mean_error = float(all_lines[-1]) # end of the file contains total_error
        # BUG (the subject of the question): this binds a brand-new inner
        # dict on every call, discarding points stored by earlier files.
        # data_dict.setdefault("HUBER", {})[points] = mean_error keeps them.
        data_dict["HUBER"] = {points: mean_error}
        return data_dict
    elif method == "L1":
        mean_error = float(all_lines[-1])
        # same clobbering problem as the HUBER branch above
        data_dict["L1"] = {points: mean_error}
        return data_dict
for file_ in all_files:
    lineMthds = file_.split("_")[1] # reading line methods like "HUBER/L1/L2..."
    algoNum = file_.split("_")[-2] # reading diff. algos number used like "1/2.."
    points = file_.split("_")[2] # diff. points used like "10/20/30..."
    if algoNum == "1":
        FI = open(file_, "r")
        all_lines = FI.readlines()
        data_dict = func_(all_lines, lineMthds, points, data_dict)
        print data_dict
        FI.close()
You can use dict.setdefault here. Currently the problem with your code is that in each call to func_ you're re-assigning data_dict["HUBER"] to a new dict.
Change:
data_dict["HUBER"] = {points: mean_error}
to:
data_dict.setdefault("HUBER", {})[points] = mean_error
You can use defaultdict from the collections module:
import collections

# Nested mapping: outer key -> {inner key: value}.  defaultdict(dict)
# creates the inner dict automatically on first access, so values can be
# added under the same outer key without any existence check.
d = collections.defaultdict(dict)
d['HUBER'].update({'100': 5.42, '10': 3.45})

Categories