Dictionaries overwriting in Python - python

This program is to take the grammar rules found in Binary.text and store them into a dictionary, where the rules are:
N = N D
N = D
D = 0
D = 1
but the current code returns D: D = 1, N:N = D, whereas I want N: N D, N: D, D:0, D:1
import sys
import string
#default length of 3
stringLength = 3
#get last argument of command line(file)
filename1 = sys.argv[-1]
#get a length from user
try:
stringLength = int(input('Length? '))
filename = input('Filename: ')
except ValueError:
print("Not a number")
#checks
print(stringLength)
print(filename)
def str2dict(filename="Binary.txt"):
result = {}
with open(filename, "r") as grammar:
#read file
lines = grammar.readlines()
count = 0
#loop through
for line in lines:
print(line)
result[line[0]] = line
print (result)
return result
print (str2dict("Binary.txt"))

Firstly, your data structure of choice is wrong. Dictionary in python is a simple key-to-value mapping. What you'd like is a map from a key to multiple values. For that you'll need:
from collections import defaultdict
result = defaultdict(list)
Next, where are you splitting on '=' ? You'll need to do that in order to get the proper key/value you are looking for? You'll need
key, value = line.split('=', 1) #Returns an array, and gets unpacked into 2 variables
Putting the above two together, you'd go about in the following way:
result = defaultdict(list)
with open(filename, "r") as grammar:
#read file
lines = grammar.readlines()
count = 0
#loop through
for line in lines:
print(line)
key, value = line.split('=', 1)
result[key.strip()].append(value.strip())
return result

Dictionaries, by definition, cannot have duplicate keys. Therefor there can only ever be a single 'D' key. You could, however, store a list of values at that key if you'd like. Ex:
from collections import defaultdict
# rest of your code...
result = defaultdict(list) # Use defaultdict so that an insert to an empty key creates a new list automatically
with open(filename, "r") as grammar:
#read file
lines = grammar.readlines()
count = 0
#loop through
for line in lines:
print(line)
result[line[0]].append(line)
print (result)
return result
This will result in something like:
{"D" : ["D = N D", "D = 0", "D = 1"], "N" : ["N = D"]}

Related

Find specific values in a txt file and adding them up with python

I have a txt file which looks like that:
[Chapter.Title1]
Irrevelent=90 B
Volt=0.10 ienl
Watt=2 W
Ampere=3 A
Irrevelent=91 C
[Chapter.Title2]
Irrevelent=999
Irrevelent=999
[Chapter.Title3]
Irrevelent=92 B
Volt=0.20 ienl
Watt=5 W
Ampere=6 A
Irrevelent=93 C
What I want is that it catches "Title1" and the values "0,1", "2" and "3". Then adds them up (which would be 5.1).
I don't care about the lines with "irrevelent" at the beginning.
And then the same with the third block. Catching "Title3" and adding "0.2", "5" and "6".
The second block with "Title2" does not contain "Volt", Watt" and "Ampere" and is therefore not relevant.
Can anyone please help me out with this?
Thank you and cheers
You can use regular expressions to get the values and the titles in lists, then use them.
txt = """[Chapter.Title1]
Irrevelent=90 B
Volt=1 V
Watt=2 W
Ampere=3 A
Irrevelent=91 C
[Chapter.Title2]
Irrevelent=92 B
Volt=4 V
Watt=5 W
Ampere=6 A
Irrevelent=93 C"""
#that's just the text
import re
rx1=r'Chapter.(.*?)\]'
rxv1=r'Volt=(\d+)'
rxv2=r'Watt=(\d+)'
rxv3=r'Ampere=(\d+)'
res1 = re.findall(rx1, txt)
resv1 = re.findall(rxv1, txt)
resv2 = re.findall(rxv2, txt)
resv3 = re.findall(rxv3, txt)
print(res1)
print(resv1)
print(resv2)
print(resv3)
Here you get the titles and the interesting values you want :
['Title1', 'Title2']
['1', '4']
['2', '5']
['3', '6']
You can then use them as you want, for example :
for title_index in range(len(res1)):
print(res1[title_index])
value=int(resv1[title_index])+int(resv2[title_index])+int(resv3[title_index])
#use float() instead of int() if you have non integer values
print("the value is:", value)
You get :
Title1
the value is: 6
Title2
the value is: 15
Or you can store them in a dictionary or an other structure, for example :
#dict(zip(keys, values))
data= dict(zip(res1, [int(resv1[i])+int(resv2[i])+int(resv3[i]) for i in range(len(res1))] ))
print(data)
You get :
{'Title1': 6, 'Title2': 15}
Edit : added opening of the file
import re
with open('filename.txt', 'r') as file:
txt = file.read()
rx1=r'Chapter.(.*?)\]'
rxv1=r'Volt=([0-9]+(?:\.[0-9]+)?)'
rxv2=r'Watt=([0-9]+(?:\.[0-9]+)?)'
rxv3=r'Ampere=([0-9]+(?:\.[0-9]+)?)'
res1 = re.findall(rx1, txt)
resv1 = re.findall(rxv1, txt)
resv2 = re.findall(rxv2, txt)
resv3 = re.findall(rxv3, txt)
data= dict(zip(res1, [float(resv1[i])+float(resv2[i])+float(resv3[i]) for i in range(len(res1))] ))
print(data)
Edit 2 : ignoring missing values
import re
with open('filename.txt', 'r') as file:
txt = file.read()
#divide the text into parts starting with "chapter"
substr = "Chapter"
chunks_idex = [_.start() for _ in re.finditer(substr, txt)]
chunks = [txt[chunks_idex[i]:chunks_idex[i+1]-1] for i in range(len(chunks_idex)-1)]
chunks.append(txt[chunks_idex[-1]:]) #add the last chunk
#print(chunks)
keys=[]
values=[]
rx1=r'Chapter.(.*?)\]'
rxv1=r'Volt=([0-9]+(?:\.[0-9]+)?)'
rxv2=r'Watt=([0-9]+(?:\.[0-9]+)?)'
rxv3=r'Ampere=([0-9]+(?:\.[0-9]+)?)'
for chunk in chunks:
res1 = re.findall(rx1, chunk)
resv1 = re.findall(rxv1, chunk)
resv2 = re.findall(rxv2, chunk)
resv3 = re.findall(rxv3, chunk)
# check if we can find all of them by checking if the lists are not empty
if res1 and resv1 and resv2 and resv3 :
keys.append(res1[0])
values.append(float(resv1[0])+float(resv2[0])+float(resv3[0]))
data= dict(zip(keys, values ))
print(data)
Here's a quick and dirty way to do this, reading line by line, if the input file is predictable enough.
In the example I just print out the titles and the values; you can of course process them however you want.
f = open('file.dat','r')
for line in f.readlines():
## Catch the title of the line:
if '[Chapter' in line:
print(line[9:-2])
## catch the values of Volt, Watt, Amere parameters
elif line[:4] in ['Volt','Watt','Ampe']:
value = line[line.index('=')+1:line.index(' ')]
print(value)
## if line is "Irrelevant", or blank, do nothing
f.close()
There are many ways to achieve this. Here's one:
d = dict()
V = {'Volt', 'Watt', 'Ampere'}
with open('chapter.txt', encoding='utf-8') as f:
key = None
for line in f:
if line.startswith('[Chapter'):
d[key := line.strip()] = 0
elif key and len(t := line.split('=')) > 1 and t[0] in V:
d[key] += float(t[1].split()[0])
for k, v in d.items():
if v > 0:
print(f'Total for {k} = {v}')
Output:
Total for [Chapter.Title1] = 6
Total for [Chapter.Title2] = 15

How to print out lines longer than specific lenght

I have an input file like this:
#sample1
ATGGTTCCAAGGCCTTGGTTAATTGGGGGGTTTTTTTTTTTTTTTTTTT
#sample2
TTGGAACCTTGGCCAATTAAGGGGGGGGGTTTTTTTCCCCCCCCCCCCC
#sample3
GGTTGGTTGGGAATTTGGTTAACCTTTTTAAATTTTTTTTTTTGGGGGG
AATTTTTTTTTTTTTGG
I want to print out the line that have specific minimum length. For example, if the minimum length I want is 66, then the output will be :
#sample3
GGTTGGTTGGGAATTTGGTTAACCTTTTTAAATTTTTTTTTTTGGGGGG
AATTTTTTTTTTTTTGG
Since only the sequence of sample 3 have the minimum length 66
Below is my code sofar:
fastfile = {}
with open(sys.argv[1]) as f:
for line in f:
line = line.strip()
if not line:
continue
if line.startswith("#"):
sequencenumber = line[1:]
if sequencenumber not in fastfile:
fastfile[sequencenumber] = []
continue
sequence = line
fastfile[sequencenumber].append(sequence)
output = []
for key, value in fastfile.items():
if len(value) >= sys.argv[2]:
output.append(value)
print (output)
Argv[1] is the path of the input file and argv[2] is the specific minimum length.
You want the values of the fastfile dictionary to be strings not lists, so instead of appending consecutive sequences to a running list, you need to concatenating them to a running string:
fastfile = {}
with open(sys.argv[1]) as f:
for line in f:
line = line.strip()
if not line:
continue
if line[0] == "#":
sequencenumber = line[1:]
if sequencenumber not in fastfile:
fastfile[sequencenumber] = ""
continue
fastfile[sequencenumber] += line
output = []
for key, value in fastfile.items():
if len(value) >= sys.argv[2]:
output.append(value)
print (output)
Or if you need to store the strings in a list like you originally do, then use "".join(value) to concatenate all the strings together, like so:
output = []
for key, value in fastfile.items():
if len("".join(value)) >= sys.argv[2]:
output.append("".join(value))
output
This looks much simpler:
with open(argv[1]) as fin :
text = fin.read()
min_length = int(argv[2])
parts = text.split('#')
# choose only the parts that have strings over the min_length
parts = [p for p in parts if any(len(i) > min_length for i in p.split('\n'))]
output = '#'.join( parts )

Python File IO - building dictionary and finding max value

Problem is to return the name of the event that has the highest number of participants in this text file:
#Beyond the Imposter Syndrome
32 students
4 faculty
10 industries
#Diversifying Computing Panel
15 students
20 faculty
#Movie Night
52 students
So I figured I had to split it into a dictionary with the keys as the event names and the values as the sum of the integers at the beginning of the other lines. I'm having a lot of trouble and I think I'm making it too complicated than it is.
This is what I have so far:
def most_attended(fname):
'''(str: filename, )'''
d = {}
f = open(fname)
lines = f.read().split(' \n')
print lines
indexes = []
count = 0
for i in range(len(lines)):
if lines[i].startswith('#'):
event = lines[i].strip('#').strip()
if event not in d:
d[event] = []
print d
indexes.append(i)
print indexes
if not lines[i].startswith('#') and indexes !=0:
num = lines[i].strip().split()[0]
print num
if num not in d[len(d)-1]:
d[len(d)-1] += [num]
print d
f.close()
import sys
from collections import defaultdict
from operator import itemgetter
def load_data(file_name):
events = defaultdict(int)
current_event = None
for line in open(file_name):
if line.startswith('#'):
current_event = line[1:].strip()
else:
participants_count = int(line.split()[0])
events[current_event] += participants_count
return events
if __name__ == '__main__':
if len(sys.argv) < 2:
print('Usage:\n\t{} <file>\n'.format(sys.argv[0]))
else:
events = load_data(sys.argv[1])
print('{}: {}'.format(*max(events.items(), key=itemgetter(1))))
Here's how I would do it.
with open("test.txt", "r") as f:
docText = f.read()
eventsList = []
#start at one because we don't want what's before the first #
for item in docText.split("#")[1:]:
individualLines = item.split("\n")
#get the sum by finding everything after the name, name is the first line here
sumPeople = 0
#we don't want the title
for line in individualLines[1:]:
if not line == "":
sumPeople += int(line.split(" ")[0]) #add everything before the first space to the sum
#add to the list a tuple with (eventname, numpeopleatevent)
eventsList.append((individualLines[0], sumPeople))
#get the item in the list with the max number of people
print(max(eventsList, key=lambda x: x[1]))
Essentially you first want to split up the document by #, ignoring the first item because that's always going to be empty. Now you have a list of events. Now for each event you have to go through, and for every additional line in that event (except the first) you have to add that lines value to the sum. Then you create a list of tuples like (eventname) (numPeopleAtEvent). Finally you use max() to get the item with the maximum number of people.
This code prints ('Movie Night', 104) obviously you can format it to however you like
Similar answers to the ones above.
result = {} # store the results
current_key = None # placeholder to hold the current_key
for line in lines:
# find what event we are currently stripping data for
# if this line doesnt start with '#', we can assume that its going to be info for the last seen event
if line.startswith("#"):
current_key = line[1:]
result[current_key] = 0
elif current_key:
# pull the number out of the string
number = [int(s) for s in line.split() if s.isdigit()]
# make sure we actually got a number in the line
if len(number) > 0:
result[current_key] = result[current_key] + number[0]
print(max(result, key=lambda x: x[1]))
This will print "Movie Night".
Your problem description says that you want to find the event with highest number of participants. I tried a solution which does not use list or dictionary.
Ps: I am new to Python.
bigEventName = ""
participants = 0
curEventName = ""
curEventParticipants = 0
# Use RegEx to split the file by lines
itr = re.finditer("^([#\w+].*)$", lines, flags = re.MULTILINE)
for m in itr:
if m.group(1).startswith("#"):
# Whenever a new group is encountered, check if the previous sum of
# participants is more than the recent event. If so, save the results.
if curEventParticipants > participants:
participants = curEventParticipants
bigEventName = curEventName
# Reset the current event name and sum as 0
curEventName = m.group(1)[1:]
curEventParticipants = 0
elif re.match("(\d+) .*", m.group(1)):
# If it is line which starts with number, extract the number and sum it
curEventParticipants += int(re.search("(\d+) .*", m.group(1)).group(1))
# This nasty code is needed to take care of the last event
bigEventName = curEventName if curEventParticipants > participants else bigEventName
# Here is the answer
print("Event: ", bigEventName)
You can do it without a dictionary and maybe make it a little simpler if just using lists:
with open('myfile.txt', 'r') as f:
lines = f.readlines()
lines = [l.strip() for l in lines if l[0] != '#'] # remove comment lines and '\n'
highest = 0
event = ""
for l in lines:
l = l.split()
if int(l[0]) > highest:
highest = int(l[0])
event = l[1]
print (event)

Code doesn't print the last sequence in a file

I have a file that looks like this:
<s0> 3
line1
line2
line3
<s1> 5
line1
line2
<s2> 4
etc. up to more than a thousand
Each sequence has a header like <s0> 3, which in this case states that three lines follow. In the example above, the number of lines below <s1> is two, so I have to correct the header to <s1> 2.
The code I have below picks out the sequence headers and the correct number of lines below them. But for some reason, it never gets the details of the last sequence. I know something is wrong but I don't know what. Can someone point me to what I am doing wrong?
import re
def call():
with open('trial_perl.txt') as fp:
docHeader = open("C:\path\header.txt","w")
c = 0
c1 = 0
header = []
k = -1
for line in fp:
if line.startswith("<s"):
#header = line.split(" ")
#print header[1]
c = 0
else:
c1 = c + 1
c += 1
if c == 0 and c1>0:
k +=1
printing = c1
if printing >= 0:
s = "<s%s>" % (k)
#print "%s %d" % (s, printing)
docHeader.write(s+" "+str(printing)+"\n")
call()
you have no sentinel at the end of the last sequence in your data, so your code will need to deal with the last sequence AFTER the loop is done.
If I may suggest some python tricks to get to your results; you don't need those c/c1/k counter variables, as they make the code more difficult to read and maintain. Instead, populate a map of sequence header to sequence items and then use the map to do all your work:
(this code works only if all sequence headers are unique - if you have duplicates, it won't work)
with open('trial_perl.txt') as fp:
docHeader = open("C:\path\header.txt","w")
data = {}
for line in fp:
if line.startswith("<s"):
current_sequence = line
# create a list with the header as the key
data[current_sequence] = []
else:
# add each sequence to the list we defined above
data[current_sequence].append(line)
Your map is ready! It looks like this:
{"<s0> 3": ["line1", "line2", "line5"],
"<s1> 5": ["line1", "line2"]}
You can iterate it like this:
for header, lines in data.items():
# header is the key, or "<s0> 3"
# lines is the list of lines under that header ["line1", "line2", etc]
num_of_lines = len(lines)
The main problem is that you neglect to check the value of c after you have read the last line. You probably had difficulty spotting this problem because of all the superfluous code. You don't have to increment k, since you can extract the value from the <s...> tag. And you don't have to have all three variables c, c1, and printing. A single count variable will do.
import re, sys
def call():
with open('trial_perl.txt') as fp:
docHeader = sys.stdout #open("C:\path\header.txt","w")
count = 0
id = None
for line in fp:
if line.startswith("<s"):
if id != None:
tag = '<s%s>' % id
docHeader.write('<s%d> %d\n' % (id, count))
count = 0
id = int(line[2:line.find('>')])
else:
count += 1
if id != None:
tag = '<s%s>' % id
docHeader.write('<s%d> %d\n' % (id, count))
call()
Another approach using groupby from itertools, where you take the maximum number of line in each group - a group corresponding to a sequence of header + line in your file: :
from itertools import groupby
def call():
with open('stack.txt') as fp:
header = [-1]
lines = [0]
for line in fp:
if line.startswith("<s"):
header.append(header[-1]+1)
lines.append(0)
else:
header.append(header[-1])
lines.append(lines[-1] +1)
with open('result','w') as f:
for key, group in groupby(zip(header[1:],lines[1:]), lambda x: x[0]):
f.write(str(("<s%d> %d\n" % max(group))))
f.close()
call()
#<s0> 3
#<s1> 2
stack.txt is the file containing your data:
<s0> 3
line1
line2
line3
<s1> 5
line1
line2

Building a dictionary using two files

I am very very new to python and I have been playing around to write a script using two files. File 1 contains a number of ID numbers such as:
1000012
1000015
1000046
1000047
1000050
1000072
1000076
100008
1000102
100013
The other file has few lines of single ID numbers followed by lines made of one ID number followed by other ID numbers which have a + or - at the end:
951450
8951670
8951800
8951863
8951889
9040311
9255087 147+ 206041- 8852164- 4458078- 1424812- 3631438- 8603144+ 4908786- 4780663+ 4643406+ 3061176- 7523696- 5876052- 163881- 6234800- 395660-
9255088 149+ 7735585+ 6359867+ 620034- 4522360- 2810885- 3705265+ 5966368- 7021344+ 9165926- 2477382+ 4015358- 2497281+ 9166415+ 6837601-
9255089 217+ 6544241+ 5181434+ 4625589+ 7433598+ 7295233+ 3938917+ 4109401+ 2135539+ 4960823+ 1838531+ 1959852+ 5698864+ 1925066+ 8212560+ 3056544+ 82N 1751642+ 4772695+ 2396528+ 2673866+ 2963754+ 5087444+ 977167+ 2892617- 7412278- 6920479- 2539680- 4315259- 8899799- 733101- 5281901- 7055760+ 8508290+ 8559218+ 7985985+ 6391093+ 2483783+ 8939632+ 3373919- 924346+ 1618865- 8670617+ 515619+ 5371996+ 2152211+ 6337329+ 284813+ 8512064+ 3469059+ 3405322+ 1415471- 1536881- 8034033+ 4592921+ 4226887- 6578783-
I want to build a dictionary using these two files. My script has to search inside File 2 for the ID numbers in File 1 and append those lines as values in which the key is the number in File 1. Therefore there may be more than one value for each key. I only want to search the lines in File 2 that have more than one number (if len(x) > 1).
the output would be something like: 1000047: 9292540 1000047+ 9126889+ 3490727- 8991434+ 4296324+ 9193432- 3766395+ 9193431+ 8949379- (I need to print each ID number in File1 as the key and as its value, the chunk of lines that contain that ID number as a whole)
Here is my -very wrong- script:
#!/usr/bin/python
f = open('file1')
z = open('file2')
d = dict() # d is an empty dictionary
for l in f:
p = l.rstrip()
d[p] = list() # sets the keys in the dictionary as p (IDs with newline characters stripped)
y = z.readlines() # retrieves a string from the path file
s = "".join(y) # makes a string from y
x = str.split(s) #splits the path file at white spaces
if len(x) > 1: # only the lines that include contigs IDs that were used to make another contig
for lines in y:
k = lines.rstrip()
w = tuple(x) # convert list x into a tuple called w
for i in w:
if i[:-1] in d:
d[p].append(k)
print d
Try:
#!/usr/bin/python
f = open('file1')
z = open('file2')
d = dict() # d is an empty dictionary
for l in f:
p = l.rstrip()
d[p] = list() # Change #1
f.close()
# Now we have a dictinary with the keys from file1 and empty lists as values
for line in z:
items = item.split() # items will be a list from 1 line
if len(items) > 1: # more than initial item in the list
k = items[0] # First is the key line
for i in items[1:]: # rest of items
if d.haskey(i[:-1]): # is it in the dict
d[i].append(k) # Add the k value
z.close()
print d
N.B. This is untested code but shouldn't be too far off.
Is this what you are looking for ?? (I have not tested it ...)
#!/usr/bin/python
f = open('file1')
z = open('file2')
d = dict() # d is an empty dictionary
for l in f.readlines():
for l2 in z.readlines():
if l.rstrip() in l2.rstrip():
d[l] = l2
z.seek(0, 0)
f.close()
z.close()
Here is a simpler version the same code, if you don't want to deal with the file pointer
f = open("file1")
z = open("file2")
d = dict() # d is an empty dictionary
file1_lines = f.readlines()
file2_lines = z.readlines()
for l in file1_lines:
for l2 in file2_lines:
if l.rstrip() in l2.rstrip():
d[l] = l2
print d
f.close()
z.close()

Categories