How to remove Duplicates in .txt file - python

I have a .txt file with the below entries:-
Apples 51824
Oranges 131236
Peaches 6564
Apples 5879
Peaches 69878
I am trying to remove the entire row (when duplicate entries are found) from this file whenever a word (say Apples) matches in that row (keeping in mind that the entry with the highest value stays).
What I presently do:-
1. Open the file in Excel.
2. Go to Data --> Remove Duplicates
The issue with this approach according to me is that I am not sure whether the end result gives me the data with highest values all the time or not.
So, How can it be programmatically (in python, preferably) done?

Here are two solutions, one in Python and another in Node.js, neither using third-party libraries:
Python:
import re
import json
def dedupe_max(src='data.txt', dst='fruits.txt'):
    """Keep, per fruit, only the line with the highest value.

    Reads whitespace-separated ``name value`` lines from *src* and writes
    the surviving ``name value`` pairs to *dst*, one per line.

    Fixes vs. the original: splits on any whitespace instead of exactly
    four spaces (the sample data uses a single space, so ``\\s{4}`` raised
    ValueError), skips blank/malformed lines, and writes the output
    directly instead of round-tripping through JSON + regex stripping.
    """
    best = {}
    with open(src) as fh:
        for raw in fh:
            parts = raw.split()
            if len(parts) != 2:
                continue  # blank or malformed line: skip instead of crashing
            name, value = parts
            if name not in best or best[name] < int(value):
                best[name] = int(value)
    with open(dst, 'w') as out:
        out.write('\n'.join('{0} {1}'.format(name, value)
                            for name, value in best.items()))


if __name__ == '__main__':
    dedupe_max()
Nodejs:
import fs from 'fs'

// Keep, per fruit, only the highest value seen in data.txt.
const file = fs.readFileSync('data.txt', 'utf8');
const fruit = {};
for (const line of file.split('\n')) {
  // Split on any whitespace: the sample data uses a single space, so the
  // original /\s{4}/ split never matched and `value` came out undefined.
  const parts = line.trim().split(/\s+/);
  if (parts.length !== 2) continue; // skip blank / malformed lines
  const [key, value] = parts;
  // `key in fruit` instead of `!fruit[key]`: a stored value of "0" is
  // falsy and would have been treated as missing.
  if (!(key in fruit) || +fruit[key] < +value) {
    fruit[key] = value;
  }
}
// Emit "name value" lines directly instead of mangling JSON with regexes.
const out = Object.entries(fruit).map(([k, v]) => `${k} ${v}`).join('\n');
fs.writeFileSync('fruits.txt', out);

The intuitive way is to use dictionaries:
def remove_duplicates(src='test.txt', dst='test_no_duplicates.txt'):
    """Write one line per word, keeping the highest value seen for it.

    Bug fixed: the original updated the dict in BOTH branches of its
    if/else, so the last occurrence of a word always won regardless of
    value. The single condition below keeps the maximum. Files are now
    managed with ``with`` so they are closed even on error.
    """
    best = {}
    with open(src) as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue  # tolerate blank lines
            word, value = parts[0], int(parts[1])
            if word not in best or best[word] < value:
                best[word] = value
    with open(dst, 'w') as new_f:
        for word, value in best.items():
            new_f.write(word + " " + str(value) + "\n")


if __name__ == '__main__':
    remove_duplicates()

That would probably work
from collections import defaultdict

filename1 = ""
filename2 = ""


def filter_max(src, dst):
    """Copy *src* to *dst*, keeping only the highest value per word.

    Bug fixed: the original did ``f2.write(line)`` where ``line`` was a
    ``(word, value)`` tuple, which raises TypeError — write() needs a
    string. Each pair is now formatted as a ``word value`` line.
    """
    words = defaultdict(int)
    with open(src) as f1:
        for line in f1:
            word, value = line.strip().split()
            if int(value) > words[word]:
                words[word] = int(value)
    with open(dst, "w") as f2:
        for word, value in words.items():
            f2.write("{0} {1}\n".format(word, value))


if __name__ == "__main__":
    filter_max(filename1, filename2)
If you have pandas data frame then:
import pandas

# Load the data, take the per-Name maximum of the "values" column, show it.
# (Assumes `filepath` is defined and the file has Name / values columns.)
frame = pandas.read_csv(filepath)
grouped_max = frame.groupby('Name')[['values']].max()
print(grouped_max)

from pathlib import Path
import pandas as pd
import numpy as np


def dedupe_with_pandas(src="./sample1.txt", dst="./out.txt"):
    """Keep the highest value per fruit via a pandas groupby and save it.

    Bug fixed: the values must be compared as integers. The original kept
    them as strings, so ``max`` compared lexicographically — which is why
    its own example output showed 'Apples 5879' beating 'Apples 51824'.
    Blank lines (e.g. a trailing newline) are now skipped as well.
    """
    text = Path(src).read_text()
    entries = [row.split() for row in text.split("\n") if row.strip()]
    df = pd.DataFrame({
        "Fruits": [e[0] for e in entries],
        "Values": [int(e[1]) for e in entries],  # int, not str — see above
    })
    new_df = df.groupby(["Fruits"]).max()
    new_df.reset_index(inplace=True)
    np.savetxt(dst, new_df.values, fmt='%s')


if __name__ == "__main__":
    dedupe_with_pandas()
Example:
sample1.txt
Apples 51824
Oranges 131236
Peaches 6564
Apples 5879
Peaches 69878
out.txt
Apples 5879
Oranges 131236
Peaches 69878

Here's a quick solution in just a few lines, and outputs a nice and flat CSV file.
Code:
import pandas as pd

# Parse "fruit count" lines, keep the max count per fruit, save a flat CSV.
with open('apples.txt') as f:
    rows = [[name, int(count)]
            for name, count in (line.strip().split() for line in f)]

frame = pd.DataFrame(rows, columns=['fruit', 'count'])
summary = frame.groupby('fruit').agg({'count': 'max'}).reset_index()
summary.to_csv('apples_out.txt', index=False)
Output:
fruit,count
Apples,51824
Oranges,131236
Peaches,69878

Use dictionary to remember best value/line pair for each fruit:
# Remember, per fruit, the (value, original line) pair with the largest value.
results = {}
with open('file.txt') as f:
    for raw in f:
        name, amount = raw.split()
        amount = int(amount)
        current = results.get(name)
        if current is None or current[0] < amount:
            results[name] = (amount, raw.strip())

# Emit the surviving original lines.
print('\n'.join(pair[1] for pair in results.values()))

Related

How do I parse this kind of text file with special separator

I need to parse the following text file into a dataframe, any suggestion about the methods?
Input:
('name: ', u'Jacky')
('male: ', True)
('hobby: ', u'play football and bascket')
('age: ', 24.0)
----------------
('name: ', u'Belly')
('male: ', True)
('hobby: ', u'dancer')
('age: ', 74.0)
----------------
('name: ', u'Chow')
('male: ', True)
('hobby: ', u'artist')
('age: ', 46.0)
output:
name male hobby age
jacky True football 24
...
I used regex to parse your text file :
import re
import pandas as pd

text_path = 'text.txt'
# Matches lines like ('name: ', u'Jacky') -> groups: (column name, value).
# Compiled once here instead of re-parsed on every line.
PATTERN = re.compile(r"\('(\w+):\s+',\s+u*'*([a-zA-Z0-9\s.]*)'*\)")


def parse_records(path):
    """Parse '----'-separated key/value blocks into a DataFrame.

    Values are coerced: numeric strings become ints, 'True'/'False'
    become real booleans, anything else stays a string.
    """
    columns = {}
    with open(path, 'r') as txt:
        for block in re.split(r"-+\n", txt.read()):
            for line in filter(None, block.split('\n')):
                col_name, value = PATTERN.search(line).group(1, 2)
                try:
                    value = int(float(value))
                except ValueError:
                    value = True if value == 'True' else False if value == 'False' else value
                columns.setdefault(col_name, []).append(value)
    return pd.DataFrame(columns)


if __name__ == '__main__':
    print(parse_records(text_path))
Output :
name male hobby age
0 Jacky True play football and bascket 24
1 Belly True dancer 74
2 Chow True artist 46
Booleans values are not string but real bool True or False, numerical value (like age) are int (you could keep them as float) and not strings.
Ask me if you don't understand something.
I don't know any way to parse this data convention with usage of some existing parser so I suggest to build your own ones. Then I would use readlines() method on open file so it allows me to iterate over lines of data and apply correct parser to each row in iteration. Finally, I would combine data and create DataFrame. Example code is below:
import pandas as pd
import sys
def parse_from_weird_file_to_pandas_df(file):
with open(file, 'r') as f:
content = f.readlines()
name_vals = [_parse_text(content[line]) for line in range(0, len(content), 5)]
male_vals = [_parse_bool(content[line]) for line in range(1, len(content), 5)]
hobby_vals = [_parse_text(content[line]) for line in range(2, len(content), 5)]
age_vals = [_parse_int(content[line]) for line in range(3, len(content), 5)]
df_rows = zip(name_vals, male_vals, hobby_vals, age_vals)
df = pd.DataFrame(data=df_rows, columns=["name", "male", "hobby", "age"])
return df
def _parse_text(text_line):
text = text_line[text_line.find("u'") + 2: text_line.find("')")]
return text
def _parse_bool(bool_line):
val_bool = bool_line[bool_line.find("', ") + 3: bool_line.find(")")]
return True if val_bool == "True" else False
def _parse_int(int_line):
val_int = int_line[int_line.find("', ") + 3: int_line.find(")")]
return int(float(val_int))
If you wish to shorten 'play football and bascket' to just 'football' you can achieve this for example by creating list with all available hobbies, looping them through parsed hobby and returning the matching one.
Here is some quick code I wrote just before lunch — not optimised, but it seems to work. (I did not remove the 'u' in the strings and did not convert the ints, but you should be able to manage that? If not, let me know and I will work on it after!)
The .join removes unnecessary characters, and I assume you always have exactly 4 fields per record...
# NOTE(review): quick-and-dirty parser — values keep the leading "u" from
# the u'...' literals and stay strings (as the author acknowledges), and
# exactly 4 fields per record are assumed.
df = pd.DataFrame(columns=['name', 'male', 'hobby', 'age'])
list_to_append = []
with open("yourfile.txt", 'r') as fh:  # `with` replaces the manual close()
    for line in fh:
        if '---' in line:
            continue  # separator rows carry no data
        payload = line.split(',')[1]  # keep only the value half of the tuple
        cleaned = ''.join(c for c in payload if c not in " ()'\n")
        list_to_append.append(cleaned)
        if len(list_to_append) == 4:  # one full record gathered
            df.loc[len(df)] = list_to_append
            list_to_append = []

how to check if item in a list is exist in other list with python?

I have a text file containing these lines for example
[:];#;;]wqwww actualnumber 1234 ;;:###
aaaa ''3# allnumber 9876
///qqq |||)))
]][]: best 56
I want to get the value 1234,9876,56. Like this(desired output)
1234
9876
56
I tried with the following script but it did not print out anything
with open("test.txt", "r") as f:
lines = f.readlines()
stripped_lines = [line.strip() for line in lines]
word = ["actual number", "potential", "time"]
if any(item in stripped_lines for item in word):
aa = stripped_lines.split("actualnumber ")[1].split(" ")[0]
bb = stripped_lines.split("allnumber ")[1].split(" ")[1]
cc = stripped_lines.split("best ")[1]
print aa
print bb
print cc
Did I miss something?
you can do it with re module
import re

with open('f.txt') as f:
    data = f.read()

# findall returns every captured number; only the first hit is reported.
act = re.findall(r'actualnumber\s+(\d+)', data)
best = re.findall(r'best\s(\d+)', data)
allnumber = re.findall(r'allnumber\s(\d+)', data)

# Python 3 print() — the original used Python 2 print statements,
# inconsistent with the rest of the answers here.
print("actualnumber : ", act[0] if act else None)
print("allnumber : ", allnumber[0] if allnumber else None)
print("best : ", best[0] if best else None)
output
actualnumber : 1234
allnumber : 9876
best : 56
The simple way is by using isdigit()
def extract_ints(path='test.txt'):
    """Return every whitespace-separated (optionally negative) integer in the file."""
    with open(path) as f:  # `with` closes the handle the original leaked
        data = f.read()
    return [int(s) for s in data.split() if s.lstrip("-").isdigit()]


if __name__ == '__main__':
    print(extract_ints())
Output :
[1234, 9876, 56]
word = ["actualnumber", "potential", "time"]
with open("./abx.txt") as fh:  # renamed: `file` shadows the builtin
    temp_list = [line.strip() for line in fh.readlines()]

# Lines mentioning any keyword at all — used only as a guard below.
list_val = [item for item in temp_list if any(x in item for x in word)]

final_list = []
match_value = ["actualnumber", "allnumber", "best"]
if list_val:
    for val in temp_list:
        val_list = val.split(" ")
        for key, value in enumerate(val_list):
            if value in match_value:
                # the token right after the keyword is the number we want
                final_list.append(val_list[key + 1])
    print(final_list)  # original had a stray trailing ':' — a SyntaxError
And at the end you can iterate list and find out your expected values.
You can extract numbers like this:
k = '[:];#;;]wqwww actualnumber 1234 ;;:###'
# Print every whitespace-separated token that is purely numeric.
for item in k.split():
    if item.isnumeric():
        print(item)
You would use your stripped lines instead. For each line in stripped_lines, split it, and check whether any item in the split line is numeric. Note that in Python 2 isnumeric() is only available on unicode objects; in Python 3 every str supports it.
with open("test.txt", "r") as f:
    lines = f.readlines()

stripped_lines = [line.strip() for line in lines]
words = ['actualnumber', 'allnumber', 'best']
found = {}
for line in stripped_lines:
    current_line = line.split()
    for position, item in enumerate(current_line):
        if item in words:
            # the value is the token immediately following the keyword
            found[item] = current_line[position + 1]
Now that you have them in a dictionary, you can access them as: found['actualnumber']. And do further processing, such as storing them in a database.

Python extract values from text using keys

I have a text file in the following format of Key Value
--START--
FirstName Kitty
LastName McCat
Color Red
random_data
Meow Meow
--END--
I'm wanting to extract specific values from the text into a variable or a dict. For example if I want to extract the values of LastName and Color what would be the best way to do this?
The random_data may be anywhere in the file and span multiple lines.
I've considered using regex but am concerned with performance and readability as in the real code I have many different keys to extract.
I could also loop over each line and check for each key but it's quite messy when having 10+ keys. For example:
if line.startswith("LastName"):
#split line at space and handle
if line.startswith("Color"):
#split line at space and handle
Hoping for something a little cleaner
tokens = ['LastName', 'Color']
dictResult = {}
with open(fileName, 'r') as fileHandle:
    for line in fileHandle:
        # split() (no argument) also drops the trailing newline that the
        # original split(" ") left glued to the value ('McCat\n').
        lineParts = line.split()
        if len(lineParts) == 2 and lineParts[0] in tokens:
            dictResult[lineParts[0]] = lineParts[1]
Assuming your file is in something called sampletxt.txt, this would work. It creates a dictionary mapping from key -> list of values.
import re

with open('sampletxt.txt', 'r') as f:
    txt = f.read()

keys = ['FirstName', 'LastName', 'Color']
# One list of captured values per key, e.g. {'Color': ['Red']}.
d = {}
for key in keys:
    d[key] = re.findall(key + r'\s(.*)\s*\n*', txt)
This version allows you to optionally specify the tokens
import re

s = """--START--
FirstName Kitty
LastName McCat
Color Red
random_data
Meow Meow
--END--"""

tokens = ["LastName", "Color"]

if len(tokens) == 0:
    # No tokens supplied: grab every "word word" pair instead.
    print(re.findall("({0}) ({0})".format("\w+"), s))
else:
    # For each token, capture the first word that follows it.
    print(list((t, re.findall("{} (\w+)".format(t), s)[0]) for t in tokens))
Output
[('LastName', 'McCat'), ('Color', 'Red')]
Building off the other answers, this function would use regular expressions to take any text key and return the value if found:
import re

file_name = 'test.txt'


def get_text_value(text_key, file_name):
    """Return the word following *text_key* on its line in *file_name*.

    Scans every line; the last match wins. Returns None when no line
    matches ``<text_key> <word>``.
    """
    match_str = text_key + r"\s(\w+)\n"
    with open(file_name, "r") as f:
        text_to_check = f.readlines()
    text_value = None
    for line in text_to_check:
        matched = re.match(match_str, line)
        if matched:
            text_value = matched.group(1)
    return text_value


if __name__ == "__main__":
    first_key = "FirstName"
    first_value = get_text_value(first_key, file_name)
    print('Check for first key "{}" and value "{}"'.format(first_key,
                                                           first_value))
    second_key = "Color"
    second_value = get_text_value(second_key, file_name)
    print('Check for first key "{}" and value "{}"'.format(second_key,
                                                           second_value))

python script that will filter data from file

I am writing a script to scrape data from a file (any format: CSV, text, JSON, HTML, etc.), match the resulting list against another file, and then replace the corresponding strings in that file. Each file contains similar data. I would like to use regular expressions, because I want to scrape the strings between %%...%% markers and store them in a list.
format of file
file1.txt
{
"alias": "%%demo%%",
"demo": "%%demo%%",
"dns_domain": "googlr.com",
"max_physical_memory": "%%maxmemory%%",
"dataset_uuid": "%%DS_UUID%%",
"nics": [
{
"nic_tag": "stub0",
"ip": "%%ip%%",
"netmask": "255.255.240.0",
"primary": "1"
}
]
}
I want to get all of the string in to the list between %%____%% sign
Python Code
import sys
import re
list = []
list1 = []
i = 0
for n in sys.argv[1:]:
#list = []
#list1 = []
print n
input1 = open(n, "w")
#print input1
output = open(n,"r")
for line1 in output:
s = line1.split("=",1)[1:2]
for m in s:
list1.append(m.strip())
for line in input1:
a = re.findall(r"%%([^%^\n]+)%%", line)
for val in a:
list.append(val)
stext = list[i:0]
rtext = list1[i:0]
input1.write(line.replace(val, rtext))
i += 1
input1.close()
output.close()
Printing list and list1 shows the placeholder names and the replacement values; list1 holds the values taken from file2.txt.
file2.txt
demo=somehost
demo=somehost2
maxmemory=1025
DS_UUID = 454s5da5d4a
ip=127.0.0.1
I want to replace the placeholders in file1 with the values from file2. Please check my code and let me know how to do it.
It's easy to find data inside well-known markers using regular expressions:
>>> import re
>>> re.findall(r"%%([^%^\n]+)%%", "hello %%there%% how\n are %%you%%")
['there', 'you']
From your updated example, you can extend the list instead of adding sublists
import fileinput
import re

# Collect every %%placeholder%% occurrence across all input files / stdin,
# extending one flat list instead of nesting per-line sublists.
array = []
for line in fileinput.input():
    array.extend(re.findall(r"%%([^%^\n]+)%%", line))
print(array)  # py3 print() — original used the Python 2 statement form
fileinput.close()
Thanks to all for your time. I finally achieved what I wanted; my code is below:
import sys
import re

list2 = []
file1 = 'file1.json'
file2 = 'test-var.txt'

# Collect the replacement values (text after '=') from file2.
output = open(file2, "r")
for line1 in output:
    s = line1.split("=", 1)[1:2]
    for m in s:
        list2.append(m)

# Collect the %%placeholder%% names from file1, renaming repeats
# (demo -> demo1) so duplicate placeholders map to successive values.
input1 = open(file1, "r")
list1 = []
txt = ''
for line in input1:
    a = re.findall(r"%%([^%^\n]+)%%", line)
    a = ''.join(a)
    if a == '':
        txt = txt + line
        continue
    if any(a in s for s in list1):
        val = '%%' + a + "1" + '%%'
        line = line.replace('%%' + a + '%%', val)
        a = a + "1"
    txt = txt + line
    list1.append(a)

# Substitute each placeholder with its positional value.
for i in range(len(list1)):
    string1 = '%%' + ''.join(list1[i]) + '%%'
    string2 = ''.join(list2[i])
    txt = txt.replace(string1, string2)

input1.close()  # original wrote `input1.close` — missing () never closed it
output.close()

output = open(file1, "w")
print(txt)  # py3 print() — original used the Python 2 statement form
output.write(txt)
output.close()

Group and Check-mark using Python

I have several files, each of which has data like this (filename:data inside separated by newline):
Mike: Plane\nCar
Paula: Plane\nTrain\nBoat\nCar
Bill: Boat\nTrain
Scott: Car
How can I create a csv file using python that groups all the different vehicles and then puts a X on the applicable person, like:
Assuming those line numbers aren't in there (easy enough to fix if they are), and with an input file like following:
Mike: Plane
Car
Paula: Plane
Train
Boat
Car
Bill: Boat
Train
Scott: Car
Solution can be found here : https://gist.github.com/999481
import sys
from collections import defaultdict
import csv
# see http://stackoverflow.com/questions/6180609/group-and-check-mark-using-python
def main():
    """Build a name -> vehicles map from every file on argv, emit CSV to stdout."""
    files = sys.argv[1:]
    if len(files) < 1:
        print("usage: ./python_checkmark.py file1 [file2 ... filen]")
        return  # original fell through and printed an empty CSV
    name_map = defaultdict(set)
    for f in files:
        with open(f, "r") as file_handle:  # `with` replaces the manual close()
            process_file(file_handle, name_map)
    print_csv(sys.stdout, name_map)
def process_file(input_file, name_map):
    """Accumulate vehicles per person from one file into *name_map*.

    A line containing a colon starts a new person ("Name: Vehicle");
    bare lines add another vehicle to the most recently seen person.
    *name_map* must map names to sets (e.g. a defaultdict(set)).
    """
    cur_name = ""
    for line in input_file:
        if ":" in line:
            cur_name, item = [x.strip() for x in line.split(":")]
        else:
            item = line.strip()
        name_map[cur_name].add(item)
def print_csv(output_file, name_map):
    """Write a check-mark matrix: one row per vehicle, one column per name."""
    # list() is required on Python 3 — dict views don't support [""] + keys().
    names = list(name_map.keys())
    items = set()
    for item_set in name_map.values():
        items = items.union(item_set)
    writer = csv.writer(output_file, quoting=csv.QUOTE_MINIMAL)
    writer.writerow([""] + names)
    for item in sorted(items):
        row_contents = ["X" if item in name_map[name] else "" for name in names]
        writer.writerow([item] + row_contents)
if __name__ == '__main__':
    main()
Output:
,Mike,Bill,Scott,Paula
Boat,,X,,X
Car,X,,X,X
Plane,X,,,X
Train,,X,,X
The only thing this script doesn't do is keep the columns in the order the names appear in. You could keep a separate list to maintain that order, since plain dicts historically made no ordering guarantee (CPython only preserves insertion order from 3.7 onward).
Here is an example of how-to parse these kind of files.
Note that the dictionary is unordered here. You can use ordered dict (in case of Python 3.2 / 2.7) from standard library, find any available implmentation / backport in case if you have older Python versions or just save an order in additional list :)
# Build {person: set(vehicles)} then flatten each entry to a CSV-ish line.
data = {}
name = None
with open(file_path) as f:
    for line in f:
        if ':' in line:  # "Name: Vehicle" starts a new person
            name, first_vehicle = line.split(':')
            # strip() matters: without it ' Plane\n' and 'Plane' would be
            # counted as two different vehicles when the global set is built
            name = name.strip()
            data[name] = set([first_vehicle.strip()])  # a set of vehicles per name
        elif name:
            data[name].add(line.strip())

# now a dictionary with names/vehicles is available
# a set of all available vehicles
vehicles = set(v for vlist in data.values()
               for v in vlist)

for name in data:
    name_vehicles = data[name]
    csv_vehicles = ''
    for v in vehicles:
        if v in name_vehicles:
            csv_vehicles += v
        csv_vehicles += ','
    csv_line = name + ',' + csv_vehicles
Assuming that the input looks like this:
Mike: Plane
Car
Paula: Plane
Train
Boat
Car
Bill: Boat
Train
Scott: Car
This python script, places the vehicles in a dictionary, indexed by the person:
#!/usr/bin/python
persons = {}
vehicles = set()

with open('input') as fd:
    for line in fd:
        line = line.strip()
        if ':' in line:  # "Person: Vehicle" opens a new person
            tmp = line.split(':')
            p = tmp[0].strip()
            v = tmp[1].strip()
            persons[p] = [v]
            vehicles.add(v)
        else:  # continuation line: another vehicle for the current person
            persons[p].append(line)
            vehicles.add(line)

for k, v in persons.items():  # iteritems()/print statements were Python 2 only
    print(k, v)
print('vehicles', vehicles)
Result:
Mike ['Plane', 'Car']
Bill ['Boat', 'Train']
Scott ['Car']
Paula ['Plane', 'Train', 'Boat', 'Car']
vehicles set(['Train', 'Car', 'Plane', 'Boat'])
Now, all the data needed are placed in data-structures. The csv-part is left as an exercise for the reader :-)
The most elegant and simple way would be like so:
import os

# Map each vehicle to the set of people (one file per person) that own it.
vehiclesToPeople = {}
people = []
for root, dirs, files in os.walk('/path/to/folder/with/files'):
    for fname in files:
        person = fname  # each file is named after its person
        people += [person]
        path = os.path.join(root, fname)
        with open(path) as f:
            for vehicle in f:
                # strip the trailing newline so 'Car\n' and 'Car' are one key
                vehiclesToPeople.setdefault(vehicle.strip(), set()).add(person)

people.sort()
table = [[''] + people]
# Original iterated `peopleToVehicles.items()` — an undefined name (NameError).
for vehicle, owners in vehiclesToPeople.items():
    # prepend the vehicle label: the original emitted rows of bare X marks,
    # unlike the labelled rows in the expected output
    table.append([vehicle] + [('X' if p in owners else '') for p in people])

csv = '\n'.join(','.join(row) for row in table)
You can do pprint.pprint(table) as well to look at it.

Categories