python script to concatenate values by row and delete identical - python

I am using python 2.7, and I have a text file that looks like this:
id value
--- ----
1 x
2 a
1 z
1 y
2 b
I am trying to get an output that looks like this:
id value
--- ----
1 x,z,y
2 a,b
Much appreciated!

The simplest solution would be to use collections.defaultdict and collections.OrderedDict. If you don't care about order you could also use sets instead of OrderedDict.
from collections import defaultdict, OrderedDict

# Groups the values of a two-column "id value" text file by id and prints one
# line per id with its distinct values joined by commas.
# `yourfilename` is a placeholder: set it to the path of the input file first.

# Keeps all unique values for each id, in order of first appearance
dd = defaultdict(OrderedDict)
# Keeps the unique ids in order of appearance
ids = OrderedDict()

with open(yourfilename) as f:
    # Skip the two header lines; the default avoids StopIteration on short files.
    next(f, None)
    next(f, None)
    for line in f:
        # split() with no argument already discards empty strings.
        fields = line.split()
        if not fields:
            continue  # ignore blank lines, e.g. one left by a trailing newline
        id_, value = fields
        dd[id_][value] = None  # dicts need a value, but here it doesn't matter which one
        ids[id_] = None

print('id value')
print('--- ----')
for id_ in ids:
    print('{} {}'.format(id_, ','.join(dd[id_])))
Result:
id value
--- ----
1 x,z,y
2 a,b
In case you want to write it to another file just concatenate what I printed with \n and write it to a file.

I think this could also work, although the other answer seems more sophisticated:
# Sample rows in "id,value" form; the repeated pairs show that duplicates are
# not added twice.
# NOTE: `input` shadows the builtin of the same name; the name is kept so the
# snippet still matches the surrounding text.
input = ['1,x',
         '2,a',
         '1,z',
         '1,y',
         '2,b',
         '2,a',  # added extra values to show duplicates won't be added
         '1,z',
         '1,y']

# Maps each id to a comma-separated string of its distinct values, in the
# order they were first seen.
output = {}
for row in input:
    parts = row.split(",")
    id_ = parts[0]
    value = parts[1]
    if id_ not in output:
        output[id_] = value
    # Compare against whole values: list(some_string) yields single characters,
    # so a multi-character value would never be recognised as a duplicate.
    elif value not in output[id_].split(","):
        output[id_] += "," + value
You end up with a dictionary similar to what you requested.

#read
# Split every line of the input file into its whitespace-separated fields.
# NOTE: the path below is an empty placeholder and must be filled in.
with open('', 'r') as fp:
    d = [line.split() for line in fp.read().split("\n")]

# m maps the first field of each line to the list of distinct second fields.
m = {}
for fields in d:
    if len(fields) < 2:
        # Blank or incomplete line (e.g. the empty string after the final
        # newline); indexing it would raise IndexError.
        continue
    key, value = fields[0], fields[1]
    if key not in m:
        m[key] = [value]
    elif value not in m[key]:
        m[key].append(value)

for key in m:
    # One pre-formatted argument prints identically on Python 2 and Python 3.
    print("%s %s" % (key, ",".join(m[key])))

Related

regex - counting by unique pattern (HH:MM:__)

I want to count the unique HH:MM:xx values (e.g. 11:11:00, 11:12:00 and 11:12:11 contain two unique HH:MM prefixes) using regex. So far I am only able to count the total number of HH:MM:SS matches in the text, and I am not sure how to continue from here. This is my code:
# Counts every HH:MM:SS match in the file (the total, not the unique HH:MM values).
# Raw string so that \d reaches the regex engine instead of being read as a
# string escape.
pattern = re.compile(r"(\d{2}):(\d{2}):(\d{2})")  # capture all the pattern with HH:MM:SS
path = r'C:\Users\CL\Desktop\abc.txt'
list1 = []  # to store values in list
with open(path, 'r') as fh:  # the with block closes the file when done
    for line in fh:
        for match in pattern.finditer(line):  # draw 11:11:00, 11:12:00, 11:12:11
            list1.append(line)  # append to a list (one entry per match)
total = len(list1)  # sum list
print(total)  # 3
sample text
11:11:00
abc
11:12:00
abc
11:12:11
abc
the desired output should be 2 (unique values - 11:11:xx and 11:12:xx)
see below (data1.txt is your data)
from collections import defaultdict

# data maps each "HH:MM" prefix to the number of timestamp lines starting with it.
data = defaultdict(int)
with open('data1.txt') as f:
    # Iterate the file directly instead of copying every line with readlines().
    for raw_line in f:
        line = raw_line.strip()
        # A line is treated as a timestamp when it contains exactly two colons.
        if line.count(':') == 2:
            data[line[:5]] += 1
print(data)
output
defaultdict(<class 'int'>, {'11:11': 1, '11:12': 2})
You could use re.findall here to capture the HH:MM part of every timestamp, and then remove the duplicates from the result:
with open(path, 'r') as file:
    data = file.read()
# Capture only the HH:MM part of every HH:MM:SS timestamp.
ts = re.findall(r'(\d{2}:\d{2}):\d{2}', data)
# Order-preserving de-duplication. The set gives constant-time membership tests
# instead of rescanning `res`, and a plain loop avoids building a throwaway
# list of None values from a comprehension used only for its side effect.
seen = set()
res = []
for x in ts:
    if x not in seen:
        seen.add(x)
        res.append(x)
print(len(res))
If you only want to count the number of unique occurrences, you can simply do:
# Counts the distinct HH:MM prefixes of all HH:MM:SS timestamps in the file.
# The path must be a raw string: in a normal literal "\U" starts a Unicode
# escape (a SyntaxError on Python 3) and "\a" is the bell character.
with open(r"C:\Users\CL\Desktop\abc.txt", "r") as txtfile:
    filetext = txtfile.read()
# The group captures only HH:MM; the set keeps one copy of each distinct value.
list1 = set(re.findall(r"(\d{2}:\d{2}):\d{2}", filetext))
total = len(list1)  # number of distinct HH:MM values
print(total)  # 2 for the sample text
You can use parentheses to specify what you want to capture (the HH:MM). Then you can use a set to remove duplicates.
Have you tried using a set instead of a list?
# Counts the distinct HH:MM prefixes among the HH:MM:SS timestamps in the file.
pattern = re.compile(r"(\d{2}):(\d{2}):(\d{2})")
path = r'C:\Users\CL\Desktop\abc.txt'
s = set()  # use a set instead of a list, to avoid duplicates
with open(path, 'r') as fh:
    for line in fh:
        for match in pattern.finditer(line):
            # Build the key from the matched hour and minute groups. Slicing the
            # line (line[:-3]) depends on whether the line ends with a newline
            # and on the timestamp being the only text on the line.
            s.add("%s:%s" % (match.group(1), match.group(2)))  # insert into set
total = len(s)  # number of elements in s
print(total)  # 2
This way, if you try to insert an element you've already seen, we won't have multiple copies of it stored, since sets don't allow duplicates.
EDIT: As commented, we are not supposed to include seconds here, which I mistakenly did originally. Fixed now.

How do you make a dictionary out of 2 elements in each list?

I have a list of lists that I want to make into a dictionary. Basically it's a list of births based on date (year/month/day/day of week/births). I want to tally the total births for each day to see in total how many births on each day of the week.
List example:
[2000,12,3,2,12000],[2000,12,4,3,34000]...
days_counts = {1: 23000, 2: 43000, ..., 7: 11943}
Here's the code so far:
# Reads births.csv and prints every row after the header as a list of fields.
with open('births.csv', 'r') as f:  # the with block closes the file after reading
    text = f.read()
text = text.split("\n")
header = text[0]
data = text[1:]
for d in data:
    split_data = d.split(",")
    print(split_data)
So basically I want to iterate over each row and add the births from duplicate days of the week into the same key (obviously).
EDIT: I have to do this with an if statement that looks for the day of the week as a key in the dict. If it's found, add the corresponding births to its value. If it's not in the dict, then add the key and value. I can't import anything or use lambda functions.
Use a collections.Counter() object to track the counts per day-of-the-week. You also want to use the csv module to handle the file parsing:
import csv
from collections import Counter

# per_dow maps each day of the week to the total births recorded for it.
per_dow = Counter()
with open('births.csv', 'r') as f:
    reader = csv.reader(f)
    header = next(reader, None)  # skip the header row; None if the file is empty
    for row in reader:
        if not row:
            continue  # csv.reader yields an empty list for a blank line
        # The last two columns are the day of the week and the number of births.
        dow, births = map(int, row[-2:])
        per_dow[dow] += births
I've used a with statement to manage the file object; Python auto-closes the file for you when the with block ends.
Now that you have a Counter object (which is a dictionary with some extra powers), you can now find the day of the week with the most births; the following loop prints out days of the week in order from most to least:
# most_common() yields (day, total) pairs ordered from the largest total down.
for day, births in per_dow.most_common():
    print(day, births)
Without using external libraries or if statements, you can use exception handling
# Totals births per day of the week without an if statement: try to add to an
# existing total and create the entry when the key is not there yet.
birth_dict = {}
# Each record is [year, month, day, day of week, births].
birth_list = [[2000, 12, 3, 2, 12000], [2000, 12, 4, 3, 34000]]
for birth in birth_list:
    day_of_week, births = birth[3], birth[4]
    try:
        birth_dict[day_of_week] += births
    except KeyError:
        # First record for this day of the week.
        birth_dict[day_of_week] = births
# Parenthesised single argument: valid on both Python 2 and Python 3.
print(birth_dict)
Ok, after playing around with the code and using print statements where I need them for tests, I finally did it without using any external libraries. A very special thanks to Tobey and the others.
Here's the code with tests:
# Totals the births per day of the week from births.csv, printing the
# intermediate values for testing.
with open('births.csv', 'r') as f:  # the with block closes the file after reading
    text = f.read()
# splitlines() leaves no empty trailing element, so the last data row is kept
# whether or not the file ends with a newline (text[1:-1] dropped it when it didn't).
text = text.splitlines()
header = text[0]
data = text[1:]
days_counts = {}
for d in data:
    if not d.strip():
        continue  # skip blank lines
    r = d.split(",")
    print(r)  #<--- used to test
    k = r[3]  # day of the week (kept as a string key)
    print(k)  #<--- used to test
    v = int(r[4])  # births
    print(v)  #<--- used to test
    if k in days_counts:
        days_counts[k] += v
        print("If : " , days_counts)  #<--- used to test
    else:
        days_counts[k] = v
        print("Else : ", days_counts)  #<--- used to test
print(days_counts)
Code without tests:
# Totals the births per day of the week from births.csv.
with open('births.csv', 'r') as f:  # the with block closes the file after reading
    text = f.read()
# splitlines() leaves no empty trailing element, so the last data row is kept
# whether or not the file ends with a newline (text[1:-1] dropped it when it didn't).
text = text.splitlines()
header = text[0]
data = text[1:]
days_counts = {}
for d in data:
    if not d.strip():
        continue  # skip blank lines
    r = d.split(",")
    k = r[3]  # day of the week (kept as a string key)
    v = int(r[4])  # births
    if k in days_counts:
        days_counts[k] += v
    else:
        days_counts[k] = v
print(days_counts)

Keyerror with 2d dictionaries

I'm trying to step through a csv and assign date and time values to their own point in a 2d dictionary.This would be in a form such that an instance of:
'11/02/16' and '23:24' in their respective columns in a row would add '1' to the value in the position marked by 'X' in the dictionary 'Dates{11/01/16{23:X}}'.
Unfortunately I get a KeyError for the following code.
import csv
import sys
from sys import argv
from collections import defaultdict

# Reads the CSV named on the command line and records, per date, which time
# buckets (first two characters of the 'Time' column) occur.
script, ReadFile = argv
with open(ReadFile, 'r') as f:
    l = f.readlines()
file_list = [row.replace('\n', '').split(',') for row in l]
header = file_list[0]
Total = 0
Dates = defaultdict(dict)
print(Dates)
index_variable = header.index('Time')
index_variable2 = header.index('# Timestamp')
for row in file_list[1:]:
    t = row[index_variable][:2]
    d = row[index_variable2][:10]
    if d in Dates:
        # NOTE: this stores 1 rather than counting. Changing it to `+= 1`
        # raises KeyError whenever t has not been seen yet for this date.
        Dates[d][t] = 1
        Total += 1
        print("true")
    else:
        Dates[d] = {}
        Dates[d][t] = 1
        Total += 1  # was `Total =+ 1`, which assigns +1 instead of incrementing
        print("false")
print(Dates)
If I replace the local variable 't' with "'Test'" the code works, but obviously the results are not what I'm after.
Thanks in advance!
Update: If I replace 'd' with 'Test' and keep 't' as it is, the program works completely fine. It's only when the Dictionary is specifically called as 'Dates[d][t]' that the program returns a KeyError.
Update 2: I've updated the code above to show my work. Currently the script will work /as long as no numbers are added/.
Dates[d][t] = 1 #If I change this...
Dates[d][t] += 1 #To this...
A KeyError occurs.
Update 3:
I changed a portion of my code...
# Count how often each time bucket t occurs on each date d.
for row in file_list[1:]:
    t = row[index_variable][:2]
    d = row[index_variable2][:10]
    # Only increment when both the date and the bucket already exist;
    # `+= 1` on a missing inner key is what raised the KeyError.
    if d in Dates and t in Dates[d]:
        Dates[d][t] += 1
        print("true")
    else:
        # Relies on Dates being the defaultdict(dict) from the script above,
        # which creates the inner dict for a new date on first access.
        Dates[d][t] = 1
        print("false")
And now the script works perfectly fine. I suppose that this means the KeyError was because I was not being specific enough (???).
Assuming that what we see above is just bad formatting of the if by the machine...
I think the problem is in the else:
Dates is a dict with various keys.
The d are the first 10 characters of the 'Date' field in your input
You want to count how many times each time bucket (the first two characters of the 'Time' field) was hit on a specific date.
Dates then is a dictionary whose keys are dates.
Dates[d] is supposed to be a dictionary, keyed by t, holding the counts for that specific date.
You haven't told python that Dates[d] is a dictionary too.
But you've made a reference to Dates[d][t]. This implies that Dates[d] already exists and it has something that is subscriptable in it.
I tried this on my system
import csv
import sys
from sys import argv
from collections import defaultdict

# Counts, per date, how often each time bucket occurs. The file-reading part of
# the original script is replaced by inline sample rows:
#script, ReadFile = argv
#f = open(ReadFile,'r')
#l = f.readlines()
#f.close()
#file_list = [row.replace('\n','').split(',') for row in l]
#header = file_list[0]
file_list = [['Date', 'Time', 'Otherstuff'],
             ['2016-02-01', '23:12:00', 'Sillyme1'],
             ['2016-02-01', '23:12:04', 'Sillyme2'],
             ['2016-02-02', '22:10:00', 'Sillyme3']]
header = file_list[0]
# Dates maps a date string to a dict that maps a time bucket to its hit count.
Dates = defaultdict(dict)
print(Dates)
index_variable = header.index('Time')
index_variable2 = header.index('Date')
for row in file_list[1:]:
    t = row[index_variable][:2]    # first two characters of the time
    d = row[index_variable2][:10]  # first ten characters of the date
    if d in Dates:
        print("true")
    # defaultdict(dict) creates Dates[d] on first access, and .get() supplies 0
    # the first time t is seen for that date. Checking only `d in Dates` before
    # `+= 1` raised KeyError for a known date with a new t.
    Dates[d][t] = Dates[d].get(t, 0) + 1
print(Dates)
Return was:
defaultdict(<class 'dict'>, {})
true
defaultdict(<class 'dict'>, {'2016-02-01': {'23': 2}, '2016-02-02': {'22': 1}})

Code doesn't print the last sequence in a file

I have a file that looks like this:
<s0> 3
line1
line2
line3
<s1> 5
line1
line2
<s2> 4
etc. up to more than a thousand
Each sequence has a header like <s0> 3, which in this case states that three lines follow. In the example above, the number of lines below <s1> is two, so I have to correct the header to <s1> 2.
The code I have below picks out the sequence headers and the correct number of lines below them. But for some reason, it never gets the details of the last sequence. I know something is wrong but I don't know what. Can someone point me to what I am doing wrong?
import re
def call():
    """Write '<sK> N' to header.txt for each sequence found in trial_perl.txt.

    K is a running sequence index and N is the number of lines counted under
    that sequence's header.

    NOTE: a sequence's count is only written when the NEXT header line is
    read, so the last sequence in the file is never written. This is the
    behaviour the question asks about and it is left unchanged here.
    """
    # Opening both files in one with statement guarantees the output file is
    # flushed and closed; the path is a raw string so the backslashes are literal.
    with open('trial_perl.txt') as fp, open(r"C:\path\header.txt", "w") as docHeader:
        c = 0    # lines counted so far in the current sequence
        c1 = 0   # line count of the most recent sequence that had any lines
        k = -1   # index of the last sequence written
        for line in fp:
            if line.startswith("<s"):
                c = 0
            else:
                c1 = c + 1
                c += 1
            # True only on a header line that follows at least one counted line.
            if c == 0 and c1 > 0:
                k += 1
                printing = c1
                if printing >= 0:
                    s = "<s%s>" % (k)
                    docHeader.write(s + " " + str(printing) + "\n")
call()
you have no sentinel at the end of the last sequence in your data, so your code will need to deal with the last sequence AFTER the loop is done.
If I may suggest some python tricks to get to your results; you don't need those c/c1/k counter variables, as they make the code more difficult to read and maintain. Instead, populate a map of sequence header to sequence items and then use the map to do all your work:
(this code works only if all sequence headers are unique - if you have duplicates, it won't work)
# Build a map from each sequence header to the list of lines that follow it.
with open('trial_perl.txt') as fp:
    # NOTE: opened for the later writing step; close it once that is done.
    docHeader = open(r"C:\path\header.txt", "w")
    data = {}
    current_sequence = None  # no header seen yet
    for line in fp:
        line = line.rstrip("\n")  # keep keys and items free of the line ending
        if line.startswith("<s"):
            current_sequence = line
            # create a list with the header as the key
            data[current_sequence] = []
        elif current_sequence is not None:
            # add each sequence to the list we defined above; lines that come
            # before the first header are skipped instead of raising NameError
            data[current_sequence].append(line)
Your map is ready! It looks like this:
{"<s0> 3": ["line1", "line2", "line3"],
"<s1> 5": ["line1", "line2"]}
You can iterate it like this:
for header, lines in data.items():
    # header is the key, e.g. "<s0> 3"
    # lines is the list of lines under that header, e.g. ["line1", "line2"]
    num_of_lines = len(lines)  # the corrected count for this header
The main problem is that you neglect to check the value of c after you have read the last line. You probably had difficulty spotting this problem because of all the superfluous code. You don't have to increment k, since you can extract the value from the <s...> tag. And you don't have to have all three variables c, c1, and printing. A single count variable will do.
import re, sys
def call():
    """Print each sequence header of trial_perl.txt with its actual line count.

    A header is any line starting with "<s"; its numeric id is read from the
    text between "<s" and ">". Every other line counts towards the most recent
    header. One line of the form "<sID> COUNT" is written to standard output
    per sequence, including the last one.
    """
    docHeader = sys.stdout  # open("C:\path\header.txt","w")
    count = 0
    seq_id = None  # id of the sequence being counted; None before the first header
    with open('trial_perl.txt') as fp:
        for line in fp:
            if line.startswith("<s"):
                # A new header ends the previous sequence, so report it now.
                if seq_id is not None:
                    docHeader.write('<s%d> %d\n' % (seq_id, count))
                count = 0
                seq_id = int(line[2:line.find('>')])
            else:
                count += 1
    # The file has no sentinel after the last sequence, so report it here.
    if seq_id is not None:
        docHeader.write('<s%d> %d\n' % (seq_id, count))
call()
Another approach uses groupby from itertools: take the maximum line count in each group, where a group corresponds to one sequence (a header plus the lines below it) in your file:
from itertools import groupby
def call():
    """Write '<sK> N' to the file 'result' for each sequence in stack.txt.

    Every input line is tagged with the index of the sequence it belongs to
    and with a running line count that restarts at 0 on each header. Grouping
    the (index, count) pairs by index and taking the maximum pair of each
    group gives the number of lines under that header.
    """
    # Seed values so that header[-1] and lines[-1] exist for the first line;
    # they are sliced off again before grouping.
    header = [-1]
    lines = [0]
    with open('stack.txt') as fp:
        for line in fp:
            if line.startswith("<s"):
                header.append(header[-1] + 1)
                lines.append(0)
            else:
                header.append(header[-1])
                lines.append(lines[-1] + 1)
    # The with block closes the output file, so no explicit close() is needed.
    with open('result', 'w') as f:
        for key, group in groupby(zip(header[1:], lines[1:]), lambda x: x[0]):
            # max(group) is the (index, count) pair with the highest count.
            f.write("<s%d> %d\n" % max(group))
call()
#<s0> 3
#<s1> 2
stack.txt is the file containing your data:
<s0> 3
line1
line2
line3
<s1> 5
line1
line2

Dictionaries overwriting in Python

This program is to take the grammar rules found in Binary.text and store them into a dictionary, where the rules are:
N = N D
N = D
D = 0
D = 1
but the current code returns D: D = 1, N:N = D, whereas I want N: N D, N: D, D:0, D:1
import sys
import string

#default length of 3
stringLength = 3
#get last argument of command line(file)
filename1 = sys.argv[-1]
# Start from the command-line value so that `filename` is always defined, even
# when the length prompt raises ValueError before the filename prompt runs.
filename = filename1
#get a length from user
try:
    stringLength = int(input('Length? '))
    filename = input('Filename: ')
except ValueError:
    print("Not a number")
#checks
print(stringLength)
print(filename)


def str2dict(filename="Binary.txt"):
    """Read the grammar file and return a dict keyed by each line's first character.

    NOTE: every line is stored under its first character, so a later rule with
    the same left-hand side overwrites the earlier one. This is the behaviour
    the question asks about and it is left unchanged here.
    """
    result = {}
    with open(filename, "r") as grammar:
        #read file
        lines = grammar.readlines()
        #loop through
        for line in lines:
            print(line)
            result[line[0]] = line
    print(result)
    return result


print(str2dict("Binary.txt"))
Firstly, your data structure of choice is wrong. Dictionary in python is a simple key-to-value mapping. What you'd like is a map from a key to multiple values. For that you'll need:
from collections import defaultdict
result = defaultdict(list)
Next, where are you splitting on '=' ? You'll need to do that in order to get the proper key/value you are looking for? You'll need
key, value = line.split('=', 1) #Returns an array, and gets unpacked into 2 variables
Putting the above two together, you'd go about in the following way:
from collections import defaultdict


def str2dict(filename="Binary.txt"):
    """Return a mapping from each rule's left-hand side to its right-hand sides.

    Each line of the form "KEY = VALUE" adds VALUE to the list stored under
    KEY, with surrounding whitespace stripped from both. Lines without an '='
    (such as blank lines) are skipped.
    """
    result = defaultdict(list)
    with open(filename, "r") as grammar:
        #loop through
        for line in grammar:
            print(line)
            if '=' not in line:
                continue  # unpacking the split would raise ValueError
            # Split on the first '=' only, so the value may itself contain '='.
            key, value = line.split('=', 1)
            result[key.strip()].append(value.strip())
    return result
Dictionaries, by definition, cannot have duplicate keys. Therefor there can only ever be a single 'D' key. You could, however, store a list of values at that key if you'd like. Ex:
from collections import defaultdict
# rest of your code...


def str2dict(filename="Binary.txt"):
    """Return a mapping from each rule's first character to the full rule lines.

    Lines are stripped of surrounding whitespace before use, and blank lines
    are skipped so they do not end up stored under a newline key.
    """
    # Use defaultdict so that an insert to an empty key creates a new list automatically
    result = defaultdict(list)
    with open(filename, "r") as grammar:
        #loop through
        for raw_line in grammar:
            line = raw_line.strip()
            if not line:
                continue
            print(line)
            result[line[0]].append(line)
    print(result)
    return result
This will result in something like:
{"N" : ["N = N D", "N = D"], "D" : ["D = 0", "D = 1"]}

Categories