How to extract key and value from a text file - python

I would like to extract keys and values from an existing text file, with the keys in one variable and the values in another.
The text file (sample.txt) contains the following content:
one:two
three:four
five:six
seven:eight
nine:ten
sample:demo
I am able to read the content from the text file, but I am not able to proceed further to extract the keys and values.
with open ("sampletxt.txt", "r") as hfile:
    sp = hfile.read()
print (sp)
x=0
for line in sp:
    sp.split(":")[x].strip()
    x+=1
The above only extracts the values and also raises an index out of range exception at the end.
When iterating through the file, I expect the output below:
Key 0 = one
Key 1 = three
Key 2 = five
Key 3 = seven
Key 4 = sample
Value 0 = two
Value 1 = four
Value 2 = six
Value 3 = eight
Value 4 = ten

This should work:
with open ("sampletxt.txt", "r") as hfile:
    sp = hfile.read()
print (sp)

lines = sp.split("\n")
for line in lines:
    # print("line:[{0}]".format(line))
    parts = line.split(":")
    if len(parts) == 2:  # guard against the trailing empty line
        print("key:[{0}], value:[{1}]".format(parts[0], parts[1]))

This also works:
sp = open ("sampletxt.txt", "r")
x=0
key=[]
value=[]
try:
    while True:
        text_line = sp.readline()
        if text_line:
            text_line = ''.join(text_line)
            text_line = text_line.split()
            text_line = ''.join(text_line).split(':')
            key.append(text_line[0])
            value.append(text_line[1])
            x += 1
        else:
            for i in range(x):
                print("Key {} = {}".format(i,key[i]))
            print("")
            for i in range(x):
                print("Value {} = {}".format(i,value[i]))
            break
finally:
    sp.close()
The output is:
Key 0 = one
Key 1 = three
Key 2 = five
Key 3 = seven
Key 4 = nine
Key 5 = sample
Value 0 = two
Value 1 = four
Value 2 = six
Value 3 = eight
Value 4 = ten
Value 5 = demo
which is close to the output you requested.

Why don't you try:
with open ("sampletxt.txt", "r") as hfile:
    sp = hfile.read()
print (sp)

dictionary = {}
for line in sp.splitlines():
    line_list = line.split(":")
    dictionary[line_list[0]] = line_list[1]
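If you then want the exact Key/Value listing from the question, one way to print it (a small sketch building on the dictionary above) is:
for i, (k, v) in enumerate(dictionary.items()):
    print("Key {} = {}".format(i, k))
for i, (k, v) in enumerate(dictionary.items()):
    print("Value {} = {}".format(i, v))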

You should always check if split returns two members (or any number you expect) before using the indexes.
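For example, a defensive version of the loop might look like this (a minimal sketch, reusing sp from the snippets above):
for line in sp.splitlines():
    parts = line.split(":")
    if len(parts) != 2:
        continue  # skip blank or malformed lines instead of crashing
    print("key:[{0}], value:[{1}]".format(parts[0], parts[1]))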

Related

Python: How to read space delimited data with different length in text file and parse it

I have space-delimited data in a text file that looks like the following:
0 1 2 3
1 2 3
3 4 5 6
1 3 5
1
2 3 5
3 5
Each line has a different length.
I need to read it starting from line 2 ('1 2 3'), parse it, and get the following information:
Number of unique data = (1,2,3,4,5,6)=6
Count of each data:
count data (1)=3
count data (2)=2
count data (3)=5
count data (4)=1
count data (5)=4
count data (6)=1
Number of lines=6
Sort the data in descending order:
data (3)
data (5)
data (1)
data (2)
data (4)
data (6)
I did this:
import csv

file = open('data.txt')
csvreader = csv.reader(file)
header = []
header = next(csvreader)
print(header)
rows = []
for row in csvreader:
    rows.append(row)
print(rows)
After this step, what should I do to get the expected results?
I would do something like this:
from collections import Counter

with open('data.txt', 'r') as file:
    lines = file.readlines()

lines = lines[1:] # skip first line
data = []
for line in lines:
    data += line.strip().split(" ")

counter = Counter(data)
print(f'unique data: {list(counter.keys())}')
print(f'count data: {list(sorted(counter.most_common(), key=lambda x: x[0]))}')
print(f'number of lines: {len(lines)}')
print(f'sort data: {[x[0] for x in counter.most_common()]}')
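If you want output that matches the exact 'count data (n)=c' lines from the question, you could format the counter built above like this (a small sketch; the keys are strings, hence the int() for numeric ordering):
for value, count in sorted(counter.items(), key=lambda kv: int(kv[0])):
    print(f'count data ({value})={count}')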
A simple brute force approach:
nums = []
counts = {}
for row in open('data.txt'):
    if row[0] == '0':
        continue
    nums.extend( [int(k) for k in row.rstrip().split()] )
print(nums)

for n in nums:
    if n not in counts:
        counts[n] = 1
    else:
        counts[n] += 1
print(counts)

ordering = list(sorted(counts.items(), key=lambda k: -k[1]))
print(ordering)
Here is another approach
def getData(infile):
    """ Read file lines and return lines 1 thru end"""
    lnes = []
    with open(infile, 'r') as data:
        lnes = data.readlines()
    return lnes[1:]

def parseData(ld):
    """ Parse data and print desired results """
    unique_symbols = set()
    all_symbols = dict()
    for l in ld:
        symbols = l.strip().split()
        for s in symbols:
            unique_symbols.add(s)
            cnt = all_symbols.pop(s, 0)
            cnt += 1
            all_symbols[s] = cnt
    print(f'Number of Unique Symbols = {len(unique_symbols)}')
    print(f'Number of Lines Processed = {len(ld)}')
    for symb in unique_symbols:
        print(f'Number of {symb} = {all_symbols[symb]}')
    print(f"Descending Sort of Symbols = {', '.join(sorted(list(unique_symbols), reverse=True))}")
On executing:
infile = r'spaced_text.txt'
parseData(getData(infile))
Produces:
Number of Unique Symbols = 6
Number of Lines Processed = 6
Number of 2 = 2
Number of 5 = 4
Number of 3 = 5
Number of 1 = 3
Number of 6 = 1
Number of 4 = 1
Descending Sort of Symbols = 6, 5, 4, 3, 2, 1
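Note that the question's 'descending order' listing is ordered by count rather than by symbol value; if that is what you need, you could sort the all_symbols dictionary built inside parseData by its counts instead (a small sketch):
by_count = sorted(all_symbols.items(), key=lambda kv: kv[1], reverse=True)
for symb, cnt in by_count:
    print(f'data ({symb})')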

Python File IO - building dictionary and finding max value

Problem is to return the name of the event that has the highest number of participants in this text file:
#Beyond the Imposter Syndrome
32 students
4 faculty
10 industries
#Diversifying Computing Panel
15 students
20 faculty
#Movie Night
52 students
So I figured I had to split it into a dictionary with the keys as the event names and the values as the sum of the integers at the beginning of the other lines. I'm having a lot of trouble and I think I'm making it more complicated than it is.
This is what I have so far:
def most_attended(fname):
    '''(str: filename, )'''
    d = {}
    f = open(fname)
    lines = f.read().split(' \n')
    print lines
    indexes = []
    count = 0
    for i in range(len(lines)):
        if lines[i].startswith('#'):
            event = lines[i].strip('#').strip()
            if event not in d:
                d[event] = []
            print d
            indexes.append(i)
            print indexes
        if not lines[i].startswith('#') and indexes != 0:
            num = lines[i].strip().split()[0]
            print num
            if num not in d[len(d)-1]:
                d[len(d)-1] += [num]
            print d
    f.close()
import sys
from collections import defaultdict
from operator import itemgetter

def load_data(file_name):
    events = defaultdict(int)
    current_event = None
    for line in open(file_name):
        if line.startswith('#'):
            current_event = line[1:].strip()
        else:
            participants_count = int(line.split()[0])
            events[current_event] += participants_count
    return events

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage:\n\t{} <file>\n'.format(sys.argv[0]))
    else:
        events = load_data(sys.argv[1])
        print('{}: {}'.format(*max(events.items(), key=itemgetter(1))))
Here's how I would do it.
with open("test.txt", "r") as f:
docText = f.read()
eventsList = []
#start at one because we don't want what's before the first #
for item in docText.split("#")[1:]:
individualLines = item.split("\n")
#get the sum by finding everything after the name, name is the first line here
sumPeople = 0
#we don't want the title
for line in individualLines[1:]:
if not line == "":
sumPeople += int(line.split(" ")[0]) #add everything before the first space to the sum
#add to the list a tuple with (eventname, numpeopleatevent)
eventsList.append((individualLines[0], sumPeople))
#get the item in the list with the max number of people
print(max(eventsList, key=lambda x: x[1]))
Essentially you first split up the document by #, ignoring the first item because that's always going to be empty. Now you have a list of events. For each event you go through every line after the first (the first is the title) and add that line's leading number to the sum. Then you build a list of tuples like (eventName, numPeopleAtEvent). Finally you use max() to get the item with the maximum number of people.
With the sample file above this code prints ('Movie Night', 52); obviously you can format it however you like.
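If you prefer a formatted result rather than the raw tuple, a quick follow-up using the eventsList built above could be:
name, total = max(eventsList, key=lambda x: x[1])
print("{} had the most participants: {}".format(name, total))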
Similar answers to the ones above.
result = {}  # store the results
current_key = None  # placeholder to hold the current key

with open("test.txt", "r") as f:  # read the event file into a list of lines
    lines = f.read().splitlines()

for line in lines:
    # find which event we are currently collecting data for;
    # if this line doesn't start with '#', we can assume it is info for the last seen event
    if line.startswith("#"):
        current_key = line[1:]
        result[current_key] = 0
    elif current_key:
        # pull the number out of the string
        number = [int(s) for s in line.split() if s.isdigit()]
        # make sure we actually got a number in the line
        if len(number) > 0:
            result[current_key] = result[current_key] + number[0]

print(max(result, key=result.get))
This will print "Movie Night".
Your problem description says that you want to find the event with the highest number of participants. I tried a solution which does not use a list or dictionary.
PS: I am new to Python.
import re

with open("test.txt", "r") as f:  # read the whole file into one string
    lines = f.read()

bigEventName = ""
participants = 0
curEventName = ""
curEventParticipants = 0

# Use RegEx to split the file by lines
itr = re.finditer(r"^([#\w+].*)$", lines, flags = re.MULTILINE)
for m in itr:
    if m.group(1).startswith("#"):
        # Whenever a new event is encountered, check if the previous sum of
        # participants is more than the best so far. If so, save the results.
        if curEventParticipants > participants:
            participants = curEventParticipants
            bigEventName = curEventName
        # Reset the current event name and sum to 0
        curEventName = m.group(1)[1:]
        curEventParticipants = 0
    elif re.match(r"(\d+) .*", m.group(1)):
        # If the line starts with a number, extract the number and add it to the sum
        curEventParticipants += int(re.search(r"(\d+) .*", m.group(1)).group(1))

# This extra check is needed to take care of the last event
bigEventName = curEventName if curEventParticipants > participants else bigEventName

# Here is the answer
print("Event: ", bigEventName)
You can do it without a dictionary and maybe make it a little simpler if just using lists:
with open('myfile.txt', 'r') as f:
    lines = [l.strip() for l in f.readlines() if l.strip()]  # drop blank lines and '\n'

events = []  # list of [event_name, total_participants] pairs
for l in lines:
    if l.startswith('#'):
        events.append([l[1:], 0])  # a '#' line starts a new event
    else:
        events[-1][1] += int(l.split()[0])  # add this line's count to the current event

highest = 0
event = ""
for name, total in events:
    if total > highest:
        highest = total
        event = name
print (event)
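As a side note, once events holds (name, total) pairs, the final loop could also be replaced with a single max() call (a small sketch using the list built above):
print(max(events, key=lambda e: e[1])[0])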

Code doesn't print the last sequence in a file

I have a file that looks like this:
<s0> 3
line1
line2
line3
<s1> 5
line1
line2
<s2> 4
etc. up to more than a thousand
Each sequence has a header like <s0> 3, which in this case states that three lines follow. In the example above, the number of lines below <s1> is two, so I have to correct the header to <s1> 2.
The code I have below picks out the sequence headers and the correct number of lines below them. But for some reason, it never gets the details of the last sequence. I know something is wrong but I don't know what. Can someone point me to what I am doing wrong?
import re

def call():
    with open('trial_perl.txt') as fp:
        docHeader = open("C:\path\header.txt","w")
        c = 0
        c1 = 0
        header = []
        k = -1
        for line in fp:
            if line.startswith("<s"):
                #header = line.split(" ")
                #print header[1]
                c = 0
            else:
                c1 = c + 1
                c += 1
            if c == 0 and c1 > 0:
                k += 1
                printing = c1
                if printing >= 0:
                    s = "<s%s>" % (k)
                    #print "%s %d" % (s, printing)
                    docHeader.write(s+" "+str(printing)+"\n")

call()
you have no sentinel at the end of the last sequence in your data, so your code will need to deal with the last sequence AFTER the loop is done.
If I may suggest some Python tricks to get to your result: you don't need those c/c1/k counter variables, as they make the code more difficult to read and maintain. Instead, populate a map from sequence header to sequence lines and then use the map to do all your work:
(this code works only if all sequence headers are unique - if you have duplicates, it won't work)
with open('trial_perl.txt') as fp:
    docHeader = open("C:\path\header.txt","w")
    data = {}
    for line in fp:
        if line.startswith("<s"):
            current_sequence = line
            # create a list with the header as the key
            data[current_sequence] = []
        else:
            # add each sequence line to the list we defined above
            data[current_sequence].append(line)
Your map is ready! It looks like this:
{"<s0> 3": ["line1", "line2", "line5"],
"<s1> 5": ["line1", "line2"]}
You can iterate it like this:
for header, lines in data.items():
    # header is the key, or "<s0> 3"
    # lines is the list of lines under that header ["line1", "line2", etc]
    num_of_lines = len(lines)
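To actually write the corrected headers back out, you could then do something like this (a sketch building on the data map above; the output filename is just an example):
with open("header_fixed.txt", "w") as out:
    for header, seq_lines in data.items():
        tag = header.split()[0]  # e.g. "<s0>" from the key "<s0> 3"
        out.write("{} {}\n".format(tag, len(seq_lines)))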
The main problem is that you neglect to check the value of c after you have read the last line. You probably had difficulty spotting this problem because of all the superfluous code. You don't have to increment k, since you can extract the value from the <s...> tag. And you don't have to have all three variables c, c1, and printing. A single count variable will do.
import sys

def call():
    with open('trial_perl.txt') as fp:
        docHeader = sys.stdout #open("C:\path\header.txt","w")
        count = 0
        id = None
        for line in fp:
            if line.startswith("<s"):
                if id != None:
                    docHeader.write('<s%d> %d\n' % (id, count))
                    count = 0
                id = int(line[2:line.find('>')])
            else:
                count += 1
        if id != None:
            docHeader.write('<s%d> %d\n' % (id, count))

call()
Another approach uses groupby from itertools, taking the maximum line count in each group, where a group corresponds to a header plus the lines under it in your file:
from itertools import groupby

def call():
    with open('stack.txt') as fp:
        header = [-1]
        lines = [0]
        for line in fp:
            if line.startswith("<s"):
                header.append(header[-1]+1)
                lines.append(0)
            else:
                header.append(header[-1])
                lines.append(lines[-1]+1)
    with open('result','w') as f:
        for key, group in groupby(zip(header[1:], lines[1:]), lambda x: x[0]):
            f.write("<s%d> %d\n" % max(group))

call()
#<s0> 3
#<s1> 2
stack.txt is the file containing your data:
<s0> 3
line1
line2
line3
<s1> 5
line1
line2

Dictionaries overwriting in Python

This program is to take the grammar rules found in Binary.txt and store them in a dictionary, where the rules are:
N = N D
N = D
D = 0
D = 1
but the current code returns D: D = 1, N: N = D, whereas I want N: N D, N: D, D: 0, D: 1
import sys
import string

#default length of 3
stringLength = 3
#get last argument of command line(file)
filename1 = sys.argv[-1]
#get a length from user
try:
    stringLength = int(input('Length? '))
    filename = input('Filename: ')
except ValueError:
    print("Not a number")
#checks
print(stringLength)
print(filename)

def str2dict(filename="Binary.txt"):
    result = {}
    with open(filename, "r") as grammar:
        #read file
        lines = grammar.readlines()
        count = 0
        #loop through
        for line in lines:
            print(line)
            result[line[0]] = line
        print (result)
    return result

print (str2dict("Binary.txt"))
Firstly, your data structure of choice is wrong. A dictionary in Python is a simple key-to-value mapping. What you'd like is a map from a key to multiple values. For that you'll need:
from collections import defaultdict
result = defaultdict(list)
Next, where are you splitting on '='? You need to do that in order to get the key/value pair you are looking for:
key, value = line.split('=', 1)  # returns a list, which gets unpacked into 2 variables
Putting the above two together, you'd go about in the following way:
from collections import defaultdict

def str2dict(filename="Binary.txt"):
    result = defaultdict(list)
    with open(filename, "r") as grammar:
        #read file
        lines = grammar.readlines()
        #loop through
        for line in lines:
            print(line)
            key, value = line.split('=', 1)
            result[key.strip()].append(value.strip())
    return result
Dictionaries, by definition, cannot have duplicate keys. Therefore there can only ever be a single 'D' key. You could, however, store a list of values at that key if you'd like. Ex:
from collections import defaultdict
# rest of your code...

def str2dict(filename="Binary.txt"):
    result = defaultdict(list)  # use defaultdict so that inserting into a missing key creates a new list automatically
    with open(filename, "r") as grammar:
        #read file
        lines = grammar.readlines()
        #loop through
        for line in lines:
            print(line)
            result[line[0]].append(line)
        print(result)
    return result
This will result in something like:
{"D" : ["D = N D", "D = 0", "D = 1"], "N" : ["N = D"]}

Index Error: Index out of bounds when using numpy in python

I have code that works fine for small CSVs of data but errors out when I run large CSVs through it. In essence this code is supposed to place 3 CSVs' worth of data into 3 separate dictionaries, combine those dictionaries into a master dictionary, and then perform arithmetic operations on that dictionary. The input CSVs look something like this:
time A B C D
0 3 4 6 4
.001 4 6 7 8
.002 4 6 7 3
The code that I am using is displayed below. The error occurs at lines 47 and 65, where I try to perform arithmetic with the dictionary. Any explanation as to why this is going on is greatly appreciated.
import numpy

Xcoord = {}
time = []
with open ('Nodal_QuardnetsX2.csv', 'r') as f:
    f.readline() # Skips first line
    for line in f:
        values = [s.strip() for s in line.split(',')]
        Xcoord[values[0]] = map(float, values[1:])
        time.append(values[0])

Ycoord = {}
with open ('Nodal_QuardnetsY2.csv', 'r') as f:
    f.readline() # Skips first line
    for line in f:
        values = [s.strip() for s in line.split(',')]
        Ycoord[values[0]] = map(float, values[1:])

Zcoord = {}
with open ('Nodal_QuardnetsZ2.csv', 'r') as f:
    f.readline() # Skips first line
    for line in f:
        values = [s.strip() for s in line.split(',')]
        Zcoord[values[0]] = map(float, values[1:])

# Create a master dictionary of the form {'key': [[x, y, z], [x, y, z]]}
CoordCombo = {}
for key in Xcoord.keys():
    CoordnateList = zip(Xcoord[key], Ycoord[key], Zcoord[key])
    CoordCombo[key] = CoordnateList

counter = 0
keycount1 = 0
keycount2 = 0.001
difference = []
NodalDisplacements = {}

#Find the difference between the x, y, and z quardnets relative to that point in time
while keycount2 <= float(values[0]):
    Sub = numpy.subtract(CoordCombo[str(keycount2)][counter], CoordCombo[str(keycount1)][counter])
    counter = counter + 1
    difference.append(Sub)
    NodalDisplacements[keycount1] = Sub
    keycount1 = keycount1 + 0.001
    keycount2 = keycount2 + 0.001

counter = 0
keycount3 = 0
keycount4 = 0.001
Sum = []
breakpoint = float(values[0])-0.001

while keycount4 <= breakpoint:
    Add = numpy.sum(NodalDisplacements[keycount4][counter], NodalDisplacements[keycount3][counter])
    Sum.append(Add)
    keycount3 = keycount3 + 0.001
    keycount4 = keycount4 + 0.001
    counter = counter + 1
    if counter == 2:
        counter = 0

print Sum
Probably a line of your CSV file does not contain 5 elements, or the line is empty.
In your reading loops I would suggest something like:
N_COLS = 5  # expected number of columns: the time column plus A, B, C, D
for line in f:
    line = line.strip()
    if not line:
        continue  # skip empty lines
    values = [s.strip() for s in line.split(',')]
    if len(values) != N_COLS:
        continue  # or raise an error...
    # rest of the processing ...
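Applied to the first reading block in the question, that guard might look like this (a sketch; the N_COLS value and the decision to silently skip bad rows are assumptions):
N_COLS = 5  # time column plus A, B, C, D

Xcoord = {}
time = []
with open ('Nodal_QuardnetsX2.csv', 'r') as f:
    f.readline()  # skip the header line
    for line in f:
        if not line.strip():
            continue  # ignore blank lines
        values = [s.strip() for s in line.split(',')]
        if len(values) != N_COLS:
            continue  # ignore malformed rows
        Xcoord[values[0]] = map(float, values[1:])
        time.append(values[0])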
