Python dictionary generation, too many variables to unpack - python
Trying to generate a dictionary from a list of data parsed from .csv files. Getting the error "too many values to unpack", any got any ideas on a fix?
There will be repeating keys/mutliple values to append to each key.
Im pretty new to python and programming so please if you could add a short explanation of what went wrong/how to fix.
Below the script is the data how it appears when res is printed.
#!/usr/bin/python
import csv
import pprint
pp = pprint.PrettyPrinter(indent=4)
import sys
import getopt
res = []
import argparse
parser = argparse.ArgumentParser()
parser.add_argument ("infile", metavar="CSV", nargs="+", type=str, help="data file")
args = parser.parse_args()
with open("out.csv","wb") as f:
output = csv.writer(f)
for filename in args.infile:
for line in csv.reader(open(filename)):
for item in line[2:]:
#to skip empty cells
if not item.strip():
continue
item = item.split(":")
item[1] = item[1].rstrip("%")
# print([line[1]+item[0],item[1]])
res.append([line[1]+item[0],item[1]])
# output.writerow([line[1]+item[0],item[1].rstrip("%")])
pp.pprint( res )
from collections import defaultdict
initial_list = [res]
d = defaultdict(list)
pp.pprint( d )
for k, v in initial_list:
d[k].append(float(v)) # possibly `int(v)` ?
and the console
[ ['P1L', '2.04'],
['Q2R', '1.93'],
['V3I', '20.03'],
['V3M', '78.18'],
['V3S', '1.67'],
['T4L', '1.16'],
['T12N', '75.60'],
['T12S', '22.73'],
['K14E', '1.03'],
['K14R', '50.65'],
['I15*', '63.94'],
['I15V', '35.30'],
['G17A', '38.31'],
['Q18R', '38.43'],
['L19T', '98.62'],
['L24*', '2.18'],
['D25E', '1.87'],
['D25N', '2.17'],
['M36I', '99.76'],
['S37N', '97.23'],
['R41K', '99.03'],
['L63V', '99.42'],
['H69K', '99.30'],
['I72V', '5.76'],
['V82I', '98.70'],
['L89M', '98.49'],
['I93L', '99.64'],
['P4S', '99.09'],
['V35T', '99.26'],
['E36A', '98.23'],
['T39D', '98.78'],
['G45R', '3.11'],
['S48T', '99.70'],
['V60I', '99.44'],
['K102R', '1.04'],
['K103N', '99.11'],
['G112E', '2.77'],
['D123N', '8.14'],
['D123S', '91.12'],
['I132M', '1.41'],
['K173A', '99.55'],
['Q174K', '99.68'],
['D177E', '98.95'],
['G190R', '2.56'],
['E194K', '2.54'],
['T200A', '99.28'],
['Q207E', '98.75'],
['R211K', '98.77'],
['W212*', '3.00'],
['L214F', '99.25'],
['V245E', '99.30'],
['E248D', '99.58'],
['D250E', '99.02'],
['T286A', '99.70'],
['K287R', '1.78'],
['E291D', '99.22'],
['V292I', '98.28'],
['I293V', '99.58'],
['V317A', '28.20'],
['L325V', '2.40'],
['G335D', '98.33'],
['F346S', '4.42'],
['N348I', '3.81'],
['R356K', '71.43'],
['M357I', '20.00'],
['M357T', '80.00']]
defaultdict(<type 'list'>, {})
Traceback (most recent call last):
File "test.py", line 40, in <module
for k, v in initial_list:
ValueError: too many values to unpack
You are wrapping the result in a list:
initial_list = [res]
then try to iterate over the list:
d = defaultdict(list)
pp.pprint( d )
for k, v in initial_list:
d[k].append(float(v)) # possibly `int(v)` ?
You want to loop over res instead:
d = defaultdict(list)
for k, v in res:
d[k].append(float(v))
You can do all this in the CSV reading loop:
from collections import defaultdict
d = defaultdict(list)
with open("out.csv","wb") as f:
output = csv.writer(f)
for filename in args.infile:
for line in csv.reader(open(filename)):
for item in line[2:]:
#to skip empty cells
if not item.strip():
continue
key, value = item.split(":", 1)
value = value.rstrip("%")
d[line1[1] + key].append(float(value))
Related
Python: put numbers into list
I have a text file that looks like this: 1 acatccacgg atgaaggaga ggagaaatgt ttcaaatcag ttctaacacg aaaaccaatt 61 ccaagaccaa gttatgaaat taccactaag cagcagtgaa agaactacat attgaagtca 121 gataagaaag caagctgaag agcaagcact gggcatcttt cttgaaaaaa gtaaggccca 181 agtaacagac tatcagattt ttttgcagtc tttgcattcc tactagatga ttcacagaga 241 agatagtcac atttatcatt cgaaaacatg aaagaattcc agtcagaact tgcatttggg 301 ggcatgtaag tctcaaggtt gtctttttgc caatgtgctg taacattatt gcactcagag 361 tgtactgctg acagccactg ttctgccgaa atgacagaaa atagggaaca I am trying to read the txt file and make a dictionary that puts the text information into a dictionary like this: {1:[acatccacgg,atgaaggaga, ggagaaatgt, ttcaaatcag, ttctaacacg, aaaaccaatt], 61 : ...} I have no clue how to do this...I am really new to python
you can try this line of code. f = open('test.txt','r') mydictionary = {} for x in f: temp = x.strip().split(' ') mydictionary.update({temp[0]:temp[1:]}) f.close() print(mydictionary)
this is the cleaner, and more readable way to do so (just try it, and you will understand): import re from os.path import exists def put_in_dict(directory: str): """With this function you can find the digits's in every line and then put it in keys and then you can put the character's in the same line as value to that key.""" my_dict = {} pattern_digit = re.compile(r"\d+") pattern_char = re.compile(r"\w+") char = [] if exists(directory): with open(f"{directory}") as file: all_text = file.read().strip() list_txt = all_text.splitlines() numbs = pattern_digit.findall(all_text) for num in range(len(list_txt)): char.append(pattern_char.findall(list_txt[num])) del char[num][0] for dict_set in range(len(numbs)): my_dict[numbs[dict_set]] = char[dict_set] return my_dict # you could make it print(my_dict) too
Parsing blocks of text data with python itertools.groupby
I'm trying to parse a blocks of text in python 2.7 using itertools.groupby The data has the following structure: BEGIN IONS TITLE=cmpd01_scan=23 RTINSECONDS=14.605 PEPMASS=694.299987792969 505975.375 CHARGE=2+ 615.839727 1760.3752441406 628.788226 2857.6264648438 922.4323436 2458.0959472656 940.4432533 9105.5 END IONS BEGIN IONS TITLE=cmpd01_scan=24 RTINSECONDS=25.737 PEPMASS=694.299987792969 505975.375 CHARGE=2+ 575.7636234 1891.1656494141 590.3553938 2133.4477539063 615.8339562 2433.4252929688 615.9032114 1784.0628662109 END IONS I need to extract information from the line beigining with "TITLE=", "PEPMASS=","CHARGE=". The code I'm using as follows: import itertools import re data_file='Test.mgf' def isa_group_separator(line): return line=='END IONS\n' regex_scan = re.compile(r'TITLE=') regex_precmass=re.compile(r'PEPMASS=') regex_charge=re.compile(r'CHARGE=') with open(data_file) as f: for (key,group) in itertools.groupby(f,isa_group_separator): #print(key,list(group)) if not key: precmass_match = filter(regex_precmass.search,group) print precmass_match scan_match= filter(regex_scan.search,group) print scan_match charge_match = filter(regex_charge.search,group) print charge_match However, the output only picks up the "PEPMASS=" line,and if 'scan_match' assignment is done before 'precmass_match', the "TITLE=" line is printed only; > ['PEPMASS=694.299987792969 505975.375\n'] [] [] > ['PEPMASS=694.299987792969 505975.375\n'] [] [] can someone point out what I'm doing wrong here?
The reason for this is that group is an iterator and it runs only once. Please find the modified script that does the job. import itertools import re data_file='Test.mgf' def isa_group_separator(line): return line == 'END IONS\n' regex_scan = re.compile(r'TITLE=') regex_precmass = re.compile(r'PEPMASS=') regex_charge = re.compile(r'CHARGE=') with open(data_file) as f: for (key, group) in itertools.groupby(f, isa_group_separator): if not key: g = list(group) precmass_match = filter(regex_precmass.search, g) print precmass_match scan_match = filter(regex_scan.search, g) print scan_match charge_match = filter(regex_charge.search, g) print charge_match
I might try to parse this way (without using groupby( import re file = """\ BEGIN IONS TITLE=cmpd01_scan=23 RTINSECONDS=14.605 PEPMASS=694.299987792969 505975.375 CHARGE=2+ 615.839727 1760.3752441406 628.788226 2857.6264648438 922.4323436 2458.0959472656 940.4432533 9105.5 END IONS BEGIN IONS TITLE=cmpd01_scan=24 RTINSECONDS=25.737 PEPMASS=694.299987792969 505975.375 CHARGE=2+ 575.7636234 1891.1656494141 590.3553938 2133.4477539063 615.8339562 2433.4252929688 615.9032114 1784.0628662109 END IONS""".splitlines() pat = re.compile(r'(TITLE|PEPMASS|CHARGE)=(.+)') data = [] for line in file: m = pat.match(line) if m is not None: if m.group(1) == 'TITLE': data.append([]) data[-1].append(m.group(2)) print(data) Prints: [['cmpd01_scan=23', '694.299987792969 505975.375', '2+'], ['cmpd01_scan=24', '694.299987792969 505975.375', '2+']]
Aggregating values in one column by their corresponding value in another from two files
had a question regarding summing the multiple values of duplicate keys into one key with the aggregate total. For example: 1:5 2:4 3:2 1:4 Very basic but I'm looking for an output that looks like: 1:9 2:4 3:2 In the two files I am using, I am dealing with a list of 51 users(column 1 of user_artists.dat) who have the artistID(column 2) and how many times that user has listened to that particular artist given by the weight(column 3). I am attempting to aggregate the total times that artist has been played, across all users and display it in a format such as: Britney Spears (289) 2393140. Any help or input would be so appreciated. import codecs #from collections import defaultdict with codecs.open("artists.dat", encoding = "utf-8") as f: artists = f.readlines() with codecs.open("user_artists.dat", encoding = "utf-8") as f: users = f.readlines() artist_list = [x.strip().split('\t') for x in artists][1:] user_stats_list = [x.strip().split('\t') for x in users][1:] artists = {} for a in artist_list: artistID, name = a[0], a[1] artists[artistID] = name grouped_user_stats = {} for u in user_stats_list: userID, artistID, weight = u grouped_user_stats[artistID] = grouped_user_stats[artistID].astype(int) grouped_user_stats[weight] = grouped_user_stats[weight].astype(int) for artistID, weight in u: grouped_user_stats.groupby('artistID')['weight'].sum() print(grouped_user_stats.groupby('artistID')['weight'].sum()) #if userID not in grouped_user_stats: #grouped_user_stats[userID] = { artistID: {'name': artists[artistID], 'plays': 1} } #else: #if artistID not in grouped_user_stats[userID]: #grouped_user_stats[userID][artistID] = {'name': artists[artistID], 'plays': 1} #else: #grouped_user_stats[userID][artistID]['plays'] += 1 #print('this never happens') #print(grouped_user_stats)
how about: import codecs from collections import defaultdict # read stuff with codecs.open("artists.dat", encoding = "utf-8") as f: artists = f.readlines() with codecs.open("user_artists.dat", encoding = "utf-8") as f: users = f.readlines() # transform artist data in a dict with "artist id" as key and "artist name" as value artist_repo = dict(x.strip().split('\t')[:2] for x in artists[1:]) user_stats_list = [x.strip().split('\t') for x in users][1:] grouped_user_stats = defaultdict(lambda:0) for u in user_stats_list: #userID, artistID, weight = u grouped_user_stats[u[0]] += int(u[2]) # accumulate weights in a dict with artist id as key and sum of wights as values # extra: "fancying" the data transforming the keys of the dict in "<artist name> (artist id)" format grouped_user_stats = dict(("%s (%s)" % (artist_repo.get(k,"Unknown artist"), k), v) for k ,v in grouped_user_stats.iteritems() ) # lastly print it for k, v in grouped_user_stats.iteritems(): print k,v
create a list of list of parameters from a file
Hi i am trying to create a list of parameters from a file The final result should be something like param=[[field],[units],[height],[site]] The problem is that the information is split into lines and some of the parameters do not have all the information #info in the file [field1] unit=m/s height=70.4 site=site1 [field2] height=20.6 site=site2 [field3] units=m ... so i would like to fulfill all the fields in such a way that, if there is not information assigns 0 or '' Final result in the example param={field1:'m/s',70.4,'site1',field2:'',20.6,site2, field3:'m',0,''} I know how to create a dictionary from list of lists but not to set default values ('' for the strings values an 0 for the numeric ones) in case some values are missing Thanks
You could group using a defaultdict: from collections import defaultdict with open("test.txt") as f: d = defaultdict(list) for line in map(str.rstrip, f): if line.startswith("["): d["fields"].append(line.strip("[]")) else: k,v = line.split("=") d[k].append(v) Input:: [field1] unit=m/s height=70.4 site=site1 [field2] height=20.6 site=site2 [field3] unit=m height=6.0 site=site3 Output: defaultdict(<type 'list'>, {'fields': ['field1', 'field2', 'field3'], 'site': ['site1', 'site2', 'site3'], 'unit': ['m/s', 'm'], 'height': ['70.4', '20.6', '6.0']}) If you actually want to group by field, you can use itertools.groupby grouping on lines that start with [: from itertools import groupby with open("test.txt") as f: grps, d = groupby(map(str.rstrip,f), key=lambda x: x.startswith("[")), {} for k,v in grps: if k: k, v = next(v).strip("[]"), list(next(grps)[1]) d[k] = v print(d) Output: {'field2': ['height=20.6', 'site=site2'], 'field3': ['unit=m', 'height=6.0', 'site=site3'], 'field1': ['unit=m/s', 'height=70.4', 'site=site1']} Each k is a line starting with [, we then call next on the grouper object to get all the lines up to the next line starting with [ or the EOF:
This would fill in the missing information. f= open('file.txt','r') field, units, height, site = [],[],[],[] param = [ field, units, height, site] lines = f.readlines() i=0 while True: try: line1 = lines[i].rstrip() if line1.startswith('['): field.append(line1.strip('[]')) else: field.append(0) i-= 1 except: field.append(0) try: line2 = lines[i+1].rstrip() if line2.startswith('unit') or line2.startswith('units'): units.append(line2.split('=')[-1]) else: units.append('') i-=1 except: units.append('') try: line3 = lines[i+2].rstrip() if line3.startswith('height'): height.append(line3.split('=')[-1]) else: height.append(0) i-=1 except: height.append(0) try: line4 = lines[i+3].rstrip() if line4.startswith('site'): site.append(line4.split('=')[-1]) else: site.append('') except: site.append('') break i +=4 Output: param: [['field1', 'field2', 'field3'], ['m/s', '', 'm'], ['70.4', '20.6', 0], ['site1', 'site2', '']]
Append values in the same key of a dictionary
How to add different values in the same key of a dictionary? These different values are added in a loop. Below is what I desired entries in the dictionary data_dict data_dict = {} And during each iterations, output should looks like: Iteration1 -> {'HUBER': {'100': 5.42}} Iteration2 -> {'HUBER': {'100': 5.42, '10': 8.34}} Iteration3 -> {'HUBER': {'100': 5.42, '10': 8.34, '20': 7.75}} etc However, at the end of the iterations, data_dict is left with the last entry only: {'HUBER': {'80': 5.50}} Here's the code: import glob path = "./meanFilesRun2/*.txt" all_files = glob.glob(path) data_dict = {} def func_(all_lines, method, points, data_dict): if method == "HUBER": mean_error = float(all_lines[-1]) # end of the file contains total_error data_dict["HUBER"] = {points: mean_error} return data_dict elif method == "L1": mean_error = float(all_lines[-1]) data_dict["L1"] = {points: mean_error} return data_dict for file_ in all_files: lineMthds = file_.split("_")[1] # reading line methods like "HUBER/L1/L2..." algoNum = file_.split("_")[-2] # reading diff. algos number used like "1/2.." points = file_.split("_")[2] # diff. points used like "10/20/30..." if algoNum == "1": FI = open(file_, "r") all_lines = FI.readlines() data_dict = func_(all_lines, lineMthds, points, data_dict) print data_dict FI.close()
You can use dict.setdefault here. Currently the problem with your code is that in each call to func_ you're re-assigning data_dict["HUBER"] to a new dict. Change: data_dict["HUBER"] = {points: mean_error} to: data_dict.setdefault("HUBER", {})[points] = mean_error
You can use defaultdict from the collections module: import collections d = collections.defaultdict(dict) d['HUBER']['100'] = 5.42 d['HUBER']['10'] = 3.45