how to read a specific line which starts in "#" from file in python - python

How can I read a specific line that starts with "#" from a file in Python,
set that line as a key in a dictionary (without the "#"), and set all the lines after it, up to the next line starting with "#", as the value in the dictionary?
Please help me.
here is the file :

from collections import defaultdict
# Group the lines of thefile.txt under the most recent '#' header line.
key = 'NOKEY'  # bucket for lines that appear before the first header
d = defaultdict(list)
with open('thefile.txt', 'r') as f:
    for line in f:
        if line.startswith('#'):
            # Remove only the leading '#' marker(s) and the surrounding
            # whitespace: the key then carries no trailing newline, and a '#'
            # that appears later in the header text is preserved.
            key = line.lstrip('#').strip()
            continue
        d[key].append(line)
Your dictionary will have a list of lines under each key. All lines that come before the first line starting with '#' would be stored under the key 'NOKEY'.

You could make use of Python's groupby function as follows:
from itertools import groupby
# Map every '#' header of input.txt to the text that follows it.
d = {}
key = ''  # lines before the first header are stored under the empty key
with open('input.txt', 'r') as f_input:
    # groupby() yields alternating runs of header lines (k is True) and
    # body lines (k is False).
    for k, g in groupby(f_input, key=lambda x: x.startswith('#')):
        if k:
            # A run may hold several consecutive headers: register each one
            # so none is lost, and let the last one own the body that follows.
            for header in g:
                key = header.strip(' #\n')
                d.setdefault(key, '')
        else:
            d[key] = ''.join(g)
# The call form of print works on both Python 2 and Python 3.
print(d)
This would give you the following kind of output:
{'The Piper at the gates of dawn': '*Lucifer sam....\nsksdlkdfslkj\ndkdkfjoiupoeri\nlkdsjforinewonre\n', 'A Saucerful of Secrets': '*Let there be\nPeople heard him say'}
Tested using Python 2.7.9

A pretty simple version
# Map each '#' header line (kept verbatim, including '#' and the newline)
# to the concatenated lines that follow it.
filename = 'test'
results = {}
key = None  # no header has been seen yet
with open(filename, 'r') as f:
    for text in f:
        if text[0] == "#":
            key = text
            results[key] = ''
        elif key is not None:
            # Lines before the first header have no key to belong to; they
            # are skipped instead of raising NameError on an unbound key.
            results[key] += text
From (ignoring additional blank lines, a by-product of the Answer formatting):
#The Piper at the gates of dawn
*Lucifer sam....
sksdlkdfslkj
dkdkfjoiupoeri
lkdsjforinewonre
# A Saucerful of Secrets
*Let there be
People heard him say
Produces:
{'#The Piper at the gates of dawn\n': '*Lucifer sam....\nsksdlkdfslkj\ndkdkfjoiupoeri\nlkdsjforinewonre\n', '# A Saucerful of Secrets \n': '*Let there be\nPeople heard him say\n'}

Related

How to remove Duplicates in .txt file

I have a .txt file with the below entries:-
Apples 51824
Oranges 131236
Peaches 6564
Apples 5879
Peaches 69878
I am trying to remove the entire row (when duplicate entries are found) from this file whenever a word (say Apples) matches in that row (keeping in mind that the entry with the highest value stays).
What I presently do:-
1. Open the file in Excel.
2. Go to Data --> Remove Duplicates
The issue with this approach according to me is that I am not sure whether the end result gives me the data with highest values all the time or not.
So, How can it be programmatically (in python, preferably) done?
Here are 2 solutions one in Python and another in Nodejs without using third party libraries:
Python:
import re
import json
# Keep, for every name in data.txt, only the row with the highest value and
# write the result to fruits.txt as "name value" lines.
with open('data.txt', 'r') as file:
    lines = file.read().splitlines()
fruit = {}
for line in lines:
    if not line.strip():
        # A trailing newline or blank row would otherwise fail to unpack.
        continue
    # Split on the last run of whitespace so any column spacing is accepted
    # and a name may itself contain spaces.
    key, value = line.rsplit(None, 1)
    if key not in fruit or int(fruit[key]) < int(value):
        fruit[key] = value
# Build the text directly rather than stripping punctuation from a JSON dump,
# which would corrupt names containing quotes, braces, colons or commas.
fruit = '\n'.join('{} {}'.format(name, value) for name, value in fruit.items())
with open('fruits.txt', 'w') as file:
    file.write(fruit)
Nodejs:
import fs from 'fs'
// Keep, for every name in data.txt, only the row with the largest numeric value
// and write the result to fruits.txt.
const file = fs.readFileSync('data.txt', 'utf8');
const lines = file.split('\n');
// Maps each name to the largest value seen so far (stored as the original string).
let fruit = {}
for (const line of lines) {
// The two columns are split on exactly four whitespace characters.
const [key, value] = line.split(/\s{4}/)
// Store the value when nothing truthy is stored for this name yet, or when
// the new value is numerically larger than the stored one.
!fruit[key] || +fruit[key] < +value ? fruit[key] = value : null
}
// Serialise to JSON, then strip quotes/braces and turn ':' and ',' into a
// space and a newline to obtain "name value" lines.
fruit = JSON.stringify(fruit)
.replace(/["{}]/g, '')
.replace(/:/g, ' ')
.replace(/,/g, '\n')
fs.writeFileSync('fruits.txt', fruit)
The intuitive way is to use dictionaries:
# Keep the highest value per name from test.txt and write one
# "name value" line per name to test_no_duplicates.txt.
my_dict = {}
with open('test.txt', 'r') as f:
    for line in f:
        s_line = line.split()
        if not s_line:
            continue  # ignore blank lines
        name, value = s_line[0], int(s_line[1])
        # Replace the stored value only when the name is new or this value is
        # larger; assigning unconditionally would keep the last row instead
        # of the highest one.
        if name not in my_dict or my_dict[name] < value:
            my_dict[name] = value
with open('test_no_duplicates.txt', 'w') as new_f:
    for key in my_dict:
        new_f.write(key + " " + str(my_dict[key]) + "\n")
That would probably work
from collections import defaultdict
# Keep the highest value per word from filename1 and write the result to
# filename2 as "word value" lines.
filename1 = ""  # placeholder: path of the input file
filename2 = ""  # placeholder: path of the output file
words = defaultdict(int)
with open(filename1) as f1:
    for line in f1:
        parts = line.split()
        if not parts:
            continue  # ignore blank lines
        word, value = parts
        # The membership test lets values <= 0 be stored too; comparing only
        # against the defaultdict's implicit 0 would silently drop them.
        if word not in words or int(value) > words[word]:
            words[word] = int(value)
with open(filename2, "w") as f2:
    for word, value in words.items():
        # write() needs a string; passing the (word, value) tuple raises TypeError.
        f2.write("{} {}\n".format(word, value))
If you have pandas data frame then:
import pandas
# Load the table and keep the largest 'values' entry for each 'Name'.
# NOTE(review): assumes the CSV has 'Name' and 'values' columns and that
# filepath is defined elsewhere -- confirm against the real data.
df = pandas.read_csv(filepath)
result = df.groupby('Name').agg({'values': 'max'})
print(result)
from pathlib import Path
import pandas as pd
import numpy as np
# Keep the highest value per fruit from sample1.txt and write "fruit value"
# rows to out.txt.
textFile = Path("./sample1.txt")
text = textFile.read_text()
# splitlines() plus the blank-row filter avoids an IndexError on the empty
# string that a trailing newline would otherwise produce.
rows = [row for row in text.splitlines() if row.strip()]
entries = [row.split() for row in rows]
data = {
    "Fruits": [x[0] for x in entries],
    # Convert to int so max() compares numbers. Compared as strings, "5879"
    # beats "51824", so Apples would come out as 5879 instead of 51824.
    "Values": [int(x[1]) for x in entries],
}
df = pd.DataFrame(data)
new_df = df.groupby(["Fruits"]).max()
new_df.reset_index(inplace=True)
np.savetxt("./out.txt", new_df.values, fmt='%s')
Example:
sample1.txt
Apples 51824
Oranges 131236
Peaches 6564
Apples 5879
Peaches 69878
out.txt
Apples 5879
Oranges 131236
Peaches 69878
Here's a quick solution in just a few lines, and outputs a nice and flat CSV file.
Code:
import pandas as pd
# Parse apples.txt into [fruit, count] rows, then write the per-fruit maximum
# to apples_out.txt as CSV.
with open('apples.txt') as f:
    raw_lines = f.readlines()
text = []
for raw_line in raw_lines:
    name, amount = raw_line.strip().split()
    text.append([name, int(amount)])
frame = pd.DataFrame(text, columns=['fruit', 'count'])
maxima = frame.groupby('fruit').agg({'count': 'max'}).reset_index()
maxima.to_csv('apples_out.txt', index=False)
Output:
fruit,count
Apples,51824
Oranges,131236
Peaches,69878
Use dictionary to remember best value/line pair for each fruit:
# For each fruit remember the (value, line) pair with the highest value,
# then print the remembered lines.
results = {}
with open('file.txt') as f:
    for line in f:
        fruit, value = line.split()
        value = int(value)
        best = results.get(fruit)
        if best is None or best[0] < value:
            results[fruit] = (value, line.strip())
kept_lines = [entry[1] for entry in results.values()]
print('\n'.join(kept_lines))

Is it possible to save the print-output in a dict with python?

I want to know if it's possible to save the output of this code into a dictionary (maybe it's also the wrong data type). I'm not experienced in coding yet, so I can't think of a way it could work.
I want to create a dictionary that has the lines of the .txt file in it alongside the value of the corresponding line. In the end, I want to write code where the user can search for a word in a line through an input, and the output should return the corresponding line. Does anyone have a suggestion? Thanks in advance! Cheers!
# Print every line of myfile.txt prefixed with its 1-based line number.
filepath = 'myfile.txt'
with open(filepath) as fp:
line = fp.readline()
cnt = 1  # line counter; the first line is number 1
# readline() returns '' at end of file, which ends the loop.
while line:
print("Line {}: {}".format(cnt, line.strip()))
line = fp.readline()
cnt += 1
This should do it (using the code you provided as a framework, it only takes one extra line to store it in a dictionary):
# Build my_dict, mapping each stripped line of myfile.txt to its 1-based line number.
# NOTE(review): identical lines share one key, so only the last line number is kept.
my_dict={}
filepath = 'myfile.txt'
with open(filepath) as fp:
line = fp.readline()
cnt = 1  # line counter; the first line is number 1
# readline() returns '' at end of file, which ends the loop.
while line:
# print("Line {}: {}".format(cnt, line.strip()))
# Store the line text as the key, with its line number as the value.
my_dict[str(line.strip())] = cnt
line = fp.readline()
cnt += 1
Then, you can prompt for user input like this:
# Ask for a search text and report the line numbers whose text contains it.
usr_in = input('enter text to search: ')
matching_lines = []
for k, v in my_dict.items():
    if usr_in in k:
        matching_lines.append(v)
print('That text is found at line(s) {}'.format(matching_lines))
For storing the line string value as key in dictionary and line number as value, you can try something like:
# Map each stripped line of myfile.txt to its 1-based line number.
filepath = 'myfile.txt'
result_dict = {}
with open(filepath) as fp:
    all_lines = fp.readlines()
for line_num, line in enumerate(all_lines):
    stripped = line.strip()
    result_dict[stripped] = line_num + 1
Or, using dictionary comprehension, above code can be:
# Same mapping (stripped line text -> 1-based line number) built with a
# dictionary comprehension.
filepath = 'myfile.txt'
with open(filepath) as fp:
    result_dict = {
        text.strip(): position
        for position, text in enumerate(fp.readlines(), start=1)
    }
Now to search and return all the lines with words:
# Collect a one-entry {line: line_number} dict for every line containing search_word.
search_result = []
for key, value in result_dict.items():
    if search_word in key:
        search_result.append({key: value})

Python extract values from text using keys

I have a text file in the following format of Key Value
--START--
FirstName Kitty
LastName McCat
Color Red
random_data
Meow Meow
--END--
I'm wanting to extract specific values from the text into a variable or a dict. For example if I want to extract the values of LastName and Color what would be the best way to do this?
The random_data may be anywhere in the file and span multiple lines.
I've considered using regex but am concerned with performance and readability as in the real code I have many different keys to extract.
I could also loop over each line and check for each key but it's quite messy when having 10+ keys. For example:
if line.startswith("LastName"):
#split line at space and handle
if line.startswith("Color"):
#split line at space and handle
Hoping for something a little cleaner
# Collect the values of the wanted keys from a "Key Value" text file.
tokens = ['LastName', 'Color']  # keys whose values should be extracted
dictResult = {}
with open(fileName, 'r') as fileHandle:
    for line in fileHandle:
        # split() without an argument discards the trailing newline and
        # tolerates repeated spaces or tabs between key and value, so the
        # stored value is clean text rather than e.g. 'McCat\n'.
        lineParts = line.split()
        if len(lineParts) == 2 and lineParts[0] in tokens:
            dictResult[lineParts[0]] = lineParts[1]
Assuming your file is in something called sampletxt.txt, this would work. It creates a dictionary mapping from key -> list of values.
import re
# Build a mapping key -> list of all values found for that key in sampletxt.txt.
with open('sampletxt.txt', 'r') as f:
    txt = f.read()
keys = ['FirstName', 'LastName', 'Color']
d = {}
for key in keys:
    # Anchor the key at the start of a line and escape it, so 'Name' cannot
    # match inside 'FirstName' and regex metacharacters in a key stay literal.
    # Only spaces/tabs may separate key and value: a bare '\s' could consume
    # the newline and capture the following line as the value.
    pattern = r'^' + re.escape(key) + r'[ \t]+(.*?)[ \t]*$'
    d[key] = re.findall(pattern, txt, re.MULTILINE)
This version allows you to optionally specify the tokens
import re
​
# Sample input in "Key Value" format with unrelated lines mixed in.
s = """--START--
FirstName Kitty
LastName McCat
Color Red
random_data
Meow Meow
--END--"""
# Keys to extract; leave the list empty to extract every "word word" pair.
tokens = ["LastName", "Color"]
if not tokens:
    pairs = re.findall(r"(\w+) (\w+)", s)
else:
    pairs = []
    for t in tokens:
        # Anchor at the start of a line and escape the token so it is matched
        # literally and never inside a longer key.
        found = re.search(r"^{} (\w+)".format(re.escape(t)), s, re.MULTILINE)
        if found:
            # A token that does not occur is skipped instead of raising
            # IndexError on an empty findall() result.
            pairs.append((t, found.group(1)))
print(pairs)
Output
[('LastName', 'McCat'), ('Color', 'Red')]
Building off the other answers, this function would use regular expressions to take any text key and return the value if found:
import re
file_name = 'test.txt'
def get_text_value(text_key, file_name):
    """Return the value that follows *text_key* in *file_name*.

    A line matches when it starts with the key, followed by one whitespace
    character and a single word. When several lines match, the value from
    the last matching line is returned.

    Args:
        text_key: Key to look for; it is matched literally.
        file_name: Path of the text file to scan.

    Returns:
        The matched word as a string, or None when no line matches.
    """
    # Escape the key so regex metacharacters in it are literal, and allow the
    # line to end with or without a newline so the final line of a file that
    # lacks a trailing newline is still recognised.
    pattern = re.compile(re.escape(text_key) + r"\s(\w+)\s*$")
    text_value = None
    with open(file_name, "r") as f:
        for line in f:
            matched = pattern.match(line)
            if matched:
                text_value = matched.group(1)
    return text_value
# Demo: look up two keys in file_name and report what was found.
if __name__ == "__main__":
    first_key = "FirstName"
    first_value = get_text_value(first_key, file_name)
    print('Check for first key "{}" and value "{}"'.format(first_key,
                                                           first_value))
    second_key = "Color"
    second_value = get_text_value(second_key, file_name)
    # The message names the second key; it was previously a copy of the
    # "first key" text.
    print('Check for second key "{}" and value "{}"'.format(second_key,
                                                            second_value))

Python: Add a new line after the first word in a sentence if the first word is all caps

I'm trying to modify a txt file. The file is a movie script in the format:
BEN We’ve discussed this before.
LUKE I can be a Jedi. I’m ready.
I'd like to insert a new line after the character name:
BEN
We’ve discussed this before.
LUKE
I can be a Jedi. I’m ready.
How do I do this in python? I currently have:
# Question's attempt at rewriting file_name in place.
# NOTE(review): fileinput and sys are used, but no import is visible in this snippet.
def modify_file(file_name):
# inplace=True makes fileinput redirect standard output into the file being read.
fh=fileinput.input(file_name,inplace=True)
for line in fh:
split_line = line.split()
if(len(split_line)>0):
first_word = split_line[0]
replacement = first_word+'\n'
# Replacing first_word within itself simply yields replacement (first word + newline).
first_word=first_word.replace(first_word,replacement)
# Only the first word is written; the remaining words of the line are never emitted.
sys.stdout.write(first_word)
fh.close()
As suggested in one of the comments, this can be done using split and isupper. An example is provided below:
# Rewrite source_path.txt so that an all-caps first word (the speaker) is
# placed on its own line, followed by the rest of the original line.
source_path = 'source_path.txt'
with open(source_path) as f:
    lines = f.readlines()
pieces = []
for line in lines:
    first, sep, rest = line.partition(' ')
    # Require a separating space: a line consisting of a single all-caps word
    # is already on its own line and must not gain an extra blank line.
    if sep and first.isupper():
        pieces.append(first + '\n' + rest)
    else:
        pieces.append(line)
# Join once at the end instead of growing a string inside the loop.
temp = ''.join(pieces)
with open(source_path, 'w') as f:
    f.write(temp)
There are multiple problems with your code.
import fileinput
def modify_file(file_name):
    """Rewrite *file_name* in place, moving an all-caps first word onto its own line.

    A line such as "BEN We've discussed this before." becomes "BEN" followed
    by "We've discussed this before." on the next line. Blank lines, lines
    with a single word and lines whose first word is not all caps are written
    back unchanged.

    Args:
        file_name: Path of the file to modify.
    """
    import sys  # needed for sys.stdout; the snippet itself only imports fileinput

    # inplace=True redirects standard output into the file being processed.
    # The context manager closes the input once, after the whole loop, and
    # also when an error occurs.
    with fileinput.input(file_name, inplace=True) as fh:
        for line in fh:
            split_line = line.split()
            if len(split_line) > 1 and split_line[0].isupper():
                sys.stdout.write(
                    split_line[0] + "\n" + " ".join(split_line[1:]) + "\n")
            else:
                # Every line must be written back, otherwise it is dropped
                # from the rewritten file.
                sys.stdout.write(line)

Group and Check-mark using Python

I have several files, each of which has data like this (filename:data inside separated by newline):
Mike: Plane\nCar
Paula: Plane\nTrain\nBoat\nCar
Bill: Boat\nTrain
Scott: Car
How can I create a csv file using python that groups all the different vehicles and then puts a X on the applicable person, like:
Assuming those line numbers aren't in there (easy enough to fix if they are), and with an input file like following:
Mike: Plane
Car
Paula: Plane
Train
Boat
Car
Bill: Boat
Train
Scott: Car
Solution can be found here : https://gist.github.com/999481
import sys
from collections import defaultdict
import csv
# see http://stackoverflow.com/questions/6180609/group-and-check-mark-using-python
def main():
    """Read the files named on the command line and print the check-mark CSV.

    Every argument is parsed with process_file(); the combined name -> items
    map is then written to standard output by print_csv().
    """
    files = sys.argv[1:]
    if len(files) < 1:
        # The call form of print works on both Python 2 and Python 3.
        print("usage: ./python_checkmark.py file1 [file2 ... filen]")
        # Nothing to process: stop here instead of emitting an empty CSV.
        return
    name_map = defaultdict(set)
    for f in files:
        # The context manager closes the file even if parsing fails.
        with open(f, "r") as file_handle:
            process_file(file_handle, name_map)
    print_csv(sys.stdout, name_map)
def process_file(input_file, name_map):
    """Add the items listed in *input_file* to *name_map*.

    A line containing ":" starts a new name ("Name: item"); each following
    line without ":" is a further item for that name.

    Args:
        input_file: Iterable of text lines, e.g. an open file.
        name_map: Mapping of name -> set of items, updated in place. It is
            indexed before insertion, so it is expected to create missing
            entries itself (the caller passes a defaultdict(set)).
    """
    cur_name = ""
    for line in input_file:
        if not line.strip():
            continue  # blank lines carry no item
        if ":" in line:
            # Split on the first colon only, so an item that itself contains
            # ':' does not break the two-value unpacking.
            cur_name, item = [x.strip() for x in line.split(":", 1)]
        else:
            item = line.strip()
        owned = name_map[cur_name]
        if item:
            # "Name:" with nothing after it registers the name without
            # adding an empty item.
            owned.add(item)
def print_csv(output_file, name_map):
    """Write a check-mark table of *name_map* to *output_file* as CSV.

    The header row holds an empty cell followed by the names. Each following
    row starts with an item (in sorted order) and has an "X" in the column of
    every name that owns that item.

    Args:
        output_file: Writable text file object.
        name_map: Mapping of name -> collection of items.
    """
    # Materialise the keys: on Python 3 keys() is a view that cannot be
    # concatenated with a list.
    names = list(name_map)
    items = set()
    for item_set in name_map.values():
        items.update(item_set)
    writer = csv.writer(output_file, quoting=csv.QUOTE_MINIMAL)
    writer.writerow([""] + names)
    for item in sorted(items):
        # A list comprehension rather than map(): on Python 3 map() returns an
        # iterator, which cannot be added to a list.
        marks = ["X" if item in name_map[name] else "" for name in names]
        writer.writerow([item] + marks)
if __name__ == '__main__':
main()
Output:
,Mike,Bill,Scott,Paula
Boat,,X,,X
Car,X,,X,X
Plane,X,,,X
Train,,X,,X
Only thing this script doesn't do is keep the columns in order that the names are in. Could keep a separate list maintaining the order, since maps/dicts are inherently unordered.
Here is an example of how-to parse these kind of files.
Note that the dictionary is unordered here. You can use an ordered dict from the standard library (Python 3.2 / 2.7), find an available implementation / backport if you have an older Python version, or just save the order in an additional list :)
# Parse "Name: vehicle" blocks into a dict of name -> set of vehicles.
data = {}
name = None
with open(file_path) as f:
    for line in f:
        if ':' in line:  # we have a name here
            # Split on the first colon only and strip both parts, so names and
            # vehicles carry no stray spaces or newlines.
            name, first_vehicle = [part.strip() for part in line.split(':', 1)]
            data[name] = {first_vehicle}  # a set of vehicles per name
        elif name and line.strip():
            # Continuation line: another vehicle for the current name. It is
            # stripped so 'Car\n' and ' Car' count as the same vehicle.
            data[name].add(line.strip())
# Now a dictionary with names/vehicles is available;
# convert it to simple CSV-formatted lines.
# All available vehicles, sorted so the column order is stable.
vehicles = sorted({v for vlist in data.values() for v in vlist})
csv_lines = []  # one CSV line per name, kept so the result is not discarded
for name in data:
    name_vehicles = data[name]
    # One cell per vehicle: the vehicle name if owned, otherwise empty.
    cells = [v if v in name_vehicles else '' for v in vehicles]
    csv_line = name + ',' + ','.join(cells)
    csv_lines.append(csv_line)
Assuming that the input looks like this:
Mike: Plane
Car
Paula: Plane
Train
Boat
Car
Bill: Boat
Train
Scott: Car
This python script, places the vehicles in a dictionary, indexed by the person:
#!/usr/bin/python
# Collect each person's vehicles (persons) and the set of all vehicles seen.
# NOTE(review): written for Python 2 (print statements, dict.iteritems()).
persons={}
vehicles=set()
with open('input') as fd:
for line in fd:
line = line.strip()
if ':' in line:
# "Name: vehicle" line: start a new vehicle list for this person.
tmp = line.split(':')
p = tmp[0].strip()
v = tmp[1].strip()
persons[p]=[v]
vehicles.add(v)
else:
# Line without ':': appended to the list of the most recently seen person.
# NOTE(review): p is unbound if the first line contains no ':'.
persons[p].append(line)
vehicles.add(line)
# Print every person with their vehicle list, then the set of all vehicles.
for k,v in persons.iteritems():
print k,v
print 'vehicles', vehicles
Result:
Mike ['Plane', 'Car']
Bill ['Boat', 'Train']
Scott ['Car']
Paula ['Plane', 'Train', 'Boat', 'Car']
vehicles set(['Train', 'Car', 'Plane', 'Boat'])
Now, all the data needed are placed in data-structures. The csv-part is left as an exercise for the reader :-)
The most elegant and simple way would be like so:
import os  # os.walk / os.path.join are used below

# Build a check-mark table from a folder with one file per person, where
# each file lists that person's vehicles, one per line.
vehiclesToPeople = {}  # vehicle -> set of people owning it
people = []
for root, dirs, files in os.walk('/path/to/folder/with/files'):
    for file in files:
        person = file  # the file name is the person's name
        people += [person]
        path = os.path.join(root, file)
        with open(path) as f:
            for vehicle in f:
                # Strip the newline so the same vehicle read from different
                # files maps to one key; skip blank lines.
                vehicle = vehicle.strip()
                if vehicle:
                    vehiclesToPeople.setdefault(vehicle, set()).add(person)
people.sort()
table = [[''] + people]
# Iterate the mapping that was actually built (vehiclesToPeople), and start
# each row with the vehicle name so it lines up with the header row.
for vehicle, owners in sorted(vehiclesToPeople.items()):
    table.append([vehicle] + ['X' if p in owners else '' for p in people])
csv = '\n'.join(','.join(row) for row in table)
You can do pprint.pprint(table) as well to look at it.

Categories