Looping over a large csv file in Python

I have a large csv file (several hundred lines) with the following structure:
filename, sitename, servername
This csv file contains several duplicates, since the servernames come from a cluster (always the same pairs) and the sitenames have language aliases (e.g. mijnhuis, myhouse, mamaison).
The result I'd like to have is the following (notice the ; and the ,):
filename; sitename1, sitename2, sitename3; servername1, servername2, servername3;
How could I do this in the most efficient way?
PS: actual code would be nice, but if you can give me directions, I'd be equally happy.

Use at your own risk:
import collections as c

f = open('input', 'r')
sites, servers = c.defaultdict(set), c.defaultdict(set)
files = set()
for line in f:
    parts = line.split(',')
    fi, site, server = [p.strip() for p in parts]
    files.add(fi)
    sites[fi].add(site)
    servers[fi].add(server)

for f in files:
    print "%s; %s; %s" % (f, ", ".join(sites[f]), ", ".join(servers[f]))

Your data structure depends on your program. Perhaps you could also store a dictionary mapping filenames to a list of (sitename, servername) tuples. There are tons of ways to accomplish what you are trying to do.
You could do something like:
import csv

file_names_to_data = {}
with open('your_csv') as f:
    reader = csv.reader(f)
    for line_list in reader:
        try:
            file_names_to_data[line_list[0]]['sitenames'].append(line_list[1])
            file_names_to_data[line_list[0]]['servernames'].append(line_list[2])
        except KeyError:
            # initialize it
            file_names_to_data[line_list[0]] = {'sitenames': [line_list[1]], 'servernames': [line_list[2]]}
Additionally, it looks like defaultdict could be extremely useful here.
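For instance, a rough sketch of the same loop using defaultdict, so the KeyError branch disappears (same hypothetical file and column names as above):

from collections import defaultdict
import csv

# every missing key starts out as a dict with two empty lists
file_names_to_data = defaultdict(lambda: {'sitenames': [], 'servernames': []})
with open('your_csv') as f:
    for row in csv.reader(f):
        file_names_to_data[row[0]]['sitenames'].append(row[1])
        file_names_to_data[row[0]]['servernames'].append(row[2])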

Use a dictionary keyed by filename, with two lists (sitenames and servernames) for each dictionary item.
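A minimal sketch of that structure, assuming the filename, sitename, servername column order from the question (the input file name is hypothetical):

# {filename: ([sitenames], [servernames])}, duplicates skipped on insert
data = {}
with open('input.csv') as f:
    for line in f:
        fname, site, server = [p.strip() for p in line.split(',')]
        site_list, server_list = data.setdefault(fname, ([], []))
        if site not in site_list:
            site_list.append(site)
        if server not in server_list:
            server_list.append(server)

# print in the requested "file; sites; servers;" format
for fname, (site_list, server_list) in data.items():
    print("%s; %s; %s;" % (fname, ", ".join(site_list), ", ".join(server_list)))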

As it happens, here is the solution I ended up with for my problem; an example file is included below.
CODE:
fp = r'PATH_TO_FILE'

aliases = []
aliases.append(("sitex", "sitez", "sitey"))

splitFile = []
for l in open(fp):
    parts = tuple(l[:-1].replace(" ", "").split(","))
    splitFile.append(parts)

def isAlias(old, new):
    print old, new
    aliasFound = False
    for alias in aliases:
        if old in alias and new in alias:
            aliasFound = True
    return aliasFound

handledSites = []
for split in splitFile:
    log = split[0]
    site = split[1]
    rp = split[2]
    matchFound = False
    for hs in handledSites:
        if site in hs[0]:
            matchFound = True
            if rp not in hs[1]:
                hs[1].append(rp)
            if log not in hs[2]:
                hs[2].append(log)
        if not matchFound:
            if isAlias(hs[0][0], site):
                matchFound = True
                hs[0].append(site)
                if rp not in hs[1]:
                    hs[1].append(rp)
                if log not in hs[2]:
                    hs[2].append(log)
    if not matchFound:
        handledSites.append(([site], [rp], [log]))

for s in handledSites:
    print s
EXAMPLE FILE
logfile[date]_[server]_sitex.log, sitex, rp1
logfile[date]_[server]_sitex.log, sitex, rp2
logfile[date]_[server]_sitey.log, sitey, rp1
logfile[date]_[server]_sitey.log, sitey, rp2
logfile[date]_[server]_sitez.log, sitez, rp1
logfile[date]_[server]_sitez.log, sitez, rp2
logfile[date]_[server]_site3.log, site3, rp1
logfile[date]_[server]_site3.log, site3, rp2

Related

get wanted data from a text file with python without using splits

Hello, I have this file:
WORKERS = yovel:10.0.0.6,james:10.0.0.7
BLACKLIST = 92.122.197.45:ynet,95.1.2.2:twitter
I'm trying to write a function in Python that takes the worker IP and returns the worker name, like this:
workername = getName(ip)
The only method I could think of uses splits (.split(":"), .split(","), etc.), but that would be long code and not very smart.
Is there a shorter way to do it?
You can use re:
import re

def getName(ip, content=open('filename.txt').read()):
    _r = re.findall(r'\w+(?=:{})'.format(ip), content)
    return _r[0] if _r else None

print(getName('10.0.0.6'))
Output:
'yovel'
Note, however, it is slightly more robust to use split:
def getName(ip):
    lines = dict(i.strip('\n').split(' = ') for i in open('filename.txt'))
    d = {b: a for a, b in map(lambda x: x.split(':'), lines['WORKERS'].split(','))}
    return d.get(ip)
Using split() doesn't look too bad here:
def getName(ip_address, filename='file.txt', line_type='WORKERS'):
    with open(filename) as in_file:
        for line in in_file:
            name, info = [x.strip() for x in line.strip().split('=')]
            if name == line_type:
                info = [x.split(':') for x in info.split(',')]
                lookup = {ip: name for name, ip in info}
                return lookup.get(ip_address)
Which works as follows:
>>> getName('10.0.0.6')
'yovel'

Speed up the write-to-different-files process

I am reading from a huge file (232MB) line by line.
First, I match each line against a regular expression.
Then, for each line, I write to a different city.txt file under the 'report' directory, according to the city name in that line. However, this process takes a while. I am wondering if there is any way of speeding it up?
Example of the input file (columns separated by a \t):
2015-02-03 19:20 Sane Diebgo Music 692.08 Cash
Actually, I have tested the code both with and without writing to the different files (simply processing the large file and building the 2 dicts), and the time difference is huge: 80% of the time is spent writing to the different files.
import os
import re

def processFile(file):
    pattern = re.compile(r"(\d{4}-\d{2}-\d{2})\t(\d{2}:\d{2})\t(.+)\t(.+)\t(\d+\.\d+|\d+)\t(\w+)\n")
    f = open(file)
    total_sale = 0
    city_dict = dict()
    categories_dict = dict()
    os.makedirs("report", exist_ok=True)
    for line in f:
        valid_entry = pattern.search(line)
        if valid_entry == None:
            print("Invalid entry: '{}'".format(line.strip()))
            continue
        else:
            entry_sale = float(valid_entry.group(5))
            total_sale += entry_sale
            city_dict.update({valid_entry.group(3): city_dict.get(valid_entry.group(3), 0) + entry_sale})
            categories_dict.update({valid_entry.group(4): categories_dict.get(valid_entry.group(4), 0) + entry_sale})
            filename = "report/" + valid_entry.group(3) + ".txt"
            if os.path.exists(filename):
                city_file = open(filename, "a")
                city_file.write(valid_entry.group(0))
                city_file.close()
            else:
                city_file = open(filename, "w")
                city_file.write(valid_entry.group(0))
                city_file.close()
    f.close()
    return (city_dict, categories_dict, total_sale)
The dictionary lookups and updates could be improved by using defaultdict:
from collections import defaultdict
city_dict = defaultdict(float)
categories_dict = defaultdict(float)
...
city = valid_entry.group(3)
category = valid_entry.group(4)
...
city_dict[city] += entry_sale
categories_dict[category] += entry_sale
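The dict changes only touch a small part of the runtime, though; per the question, roughly 80% of the time goes into the per-line open/write/close of the city files. One common way to cut that down (a sketch, not from the original answer, assuming the number of distinct cities is small enough to keep one handle per city open) is to cache the open file objects:

# cache one open handle per city instead of open/write/close per line
city_files = {}

def write_city_line(city, text):
    handle = city_files.get(city)
    if handle is None:
        # first time we see this city: open its report file once, in append mode
        handle = open("report/" + city + ".txt", "a")
        city_files[city] = handle
    handle.write(text)

# inside the loop, instead of the open/write/close block:
#     write_city_line(valid_entry.group(3), valid_entry.group(0))

# after the loop finishes:
for handle in city_files.values():
    handle.close()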

Array/List from txt file in Python

I was trying to get values from a .txt file into an array/list in Python.
Let's say I have this data in user.txt :
ghost:001
ghost:002
ghost:003
So I want to output it as:
'ghost:001','ghost:002','ghost:003'
I use this function
def readFromFile(filename, use_csv):
    userlist = ''
    userlist_b = ''
    print("Fetching users from '%s'" % filename)
    f = open(filename, "r")
    for line in f:
        userlist += str(line)
    userlist = "','".join(userlist.split("\n"))
    userlist = "'" + userlist + "'"
    userlist = "(%s)" % userlist
    return userlist
My question is how could I do this:
I want to print specific user. Something like
idx = 2
print("User[%s] : %s",%idx, %(array[idx]))
*output:*
User[2] : ghost:003
How do I form the array?
Could anyone help me?
I would store the users in a dict where the keys increment for each user:
d = {}
with open("in.txt") as f:
    user = 1
    for line in f:
        d[user] = line.rstrip()
        user += 1

print(d)
{1: 'ghost:001', 2: 'ghost:002', 3: 'ghost:003'}
If you just want a list of user and to access by index:
with open("in.txt") as f:
users = f.readlines()
print("User {}".format(users[0]))
User ghost:001
Look into loading dictionaries. This code should help you.
import json
import pickle
d = { 'field1': 'value1', 'field2': 2, }
json.dump(d,open("testjson.txt","w"))
print json.load(open("testjson.txt","r"))
pickle.dump(d,open("testpickle.txt","w"))
print pickle.load(open("testpickle.txt","r"))
If you want the file (one big string) split out into smaller strings, don't build up a new string, then split it apart again. Just append each line to a list:
def readFromFile(filename, use_csv):
    userlist = []
    print("Fetching users from '%s'" % filename)
    with open(filename, "r") as f:
        for line in f:
            userlist.append(line.rstrip("\n"))
    return userlist
array = readFromFile('somefile', use_csv)
idx = 2
print("User[%s] : %s" % (idx, array[idx]))
Not sure about the User['idx'] part of your question.
Try to use list comprehensions.
Use indexing rather than dictionaries if that's all you need. (I can add a dict version if the second part of the line is really the index you are looking up.)
# read the file and use strip to remove trailing \n
User = [line.strip() for line in open(filename).readlines()]
# your output
print "User[2] : %s"%User[2]
# commented line is more clear
#print ','.join(User)
# but this use of repr adds the single quotes you showed
print ','.join(repr(user) for user in User)
output:
User[2] : ghost:003
'ghost:001','ghost:002','ghost:003'

Searching a file with the contents of another file python

I have a file that has a unique ID number on each line. I am trying to search a different file for occurrences of these ID numbers and return the lines where they appear in the second file, in this case into an output file. I am new to programming and this is what I have so far.
outlist = []
with open('readID.txt', 'r') as readID, \
     open('GOlines.txt', 'w') as output, \
     open('GO.txt', 'r') as GO:
    x = readID.readlines()
    print x
    for line in GO:
        if x[1:-1] in line:
            outlist.append(line)
            outlist.append('\n')
        if x[1:-1] in line:
            outlist.append(line)
            outlist.append('\n')
    print outlist
    output.writelines(outlist)
The files look like this: readID.txt
00073810.1
00082422.1
00018647.1
00063072.1
GO.txt
#query GO reference DB reference family
HumanDistalGut_READ_00048904.2 GO:0006412 TIGRFAM TIGR00001
HumanDistalGut_READ_00043244.3 GO:0022625 TIGRFAM TIGR00001
HumanDistalGut_READ_00048644.4 GO:0000315 TIGRFAM TIGR00001
HumanDistalGut_READ_00067264.5 GO:0003735 TIGRFAM TIGR00001
The read ids match up with some but not all of the ids after READ...
#!/usr/bin/env python
# encoding: utf-8

import sys
import re

def extract_id(line):
    """
    input: HumanDistalGut_READ_00048904.2 GO:0006412 TIGRFAM TIGR00001
    returns: 00048904.2
    """
    result = re.search(r'READ_(\d{8}\.\d)', line)
    if result != None:
        return result.group(1)
    else:
        return None

def extract_go_num(line):
    """
    input: HumanDistalGut_READ_00048904.2 GO:0006412 TIGRFAM TIGR00001
    returns: 0006412
    """
    result = re.search(r'GO:(\d{7})', line)
    if result != None:
        return result.group(1)
    else:
        return None

def main(argv=None):
    if argv is None:
        argv = sys.argv
    with open('readID.txt', 'r') as f:
        # strip the trailing newlines so membership tests against extract_id() work
        ids = frozenset(line.strip() for line in f)
    with open('GO.txt', 'r') as haystack, \
         open('GOLines.txt', 'w') as output:
        for line in haystack:
            if extract_id(line) in ids:
                output.write(extract_go_num(line) + '\n')

if __name__ == "__main__":
    sys.exit(main())
I'm trading memory overhead for an O(n) solution rather than O(n^2).
I'm using regular expressions to extract the ids and go numbers, but it's brittle if the number of digits changes.
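If the digit counts may vary, one possible alternative is to extract the fields by splitting on whitespace instead of matching fixed widths (a sketch, assuming the first column always ends in READ_<id> and the second column is GO:<number>):

def extract_id(line):
    # first whitespace-separated column, keep what follows 'READ_'
    cols = line.split()
    if cols and 'READ_' in cols[0]:
        return cols[0].rsplit('READ_', 1)[-1]
    return None

def extract_go_num(line):
    # second column, drop the 'GO:' prefix
    cols = line.split()
    if len(cols) > 1 and cols[1].startswith('GO:'):
        return cols[1][len('GO:'):]
    return None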
Maybe something like this:
with open('readID.txt', 'r') as readID, open('GOlines.txt', 'w') as output, open('GO.txt', 'r') as GO:
    GO_lines = GO.readlines()  # read once; a file object would be exhausted after the first ID
    for ID in readID:
        ID = ID.strip()        # drop the trailing newline before substring matching
        for line in GO_lines:
            if ID in line:
                output.write(line)
If your files are small enough to fit in your memory.
with open('/somepath/GO.txt') as f:
    pool = f.readlines()

with open('/somepath/readID.txt') as f:
    tokens = f.readlines()

# strip spaces/new lines
tokens = [t.strip() for t in tokens]

found = [(t, lno) for t in tokens for (lno, l) in enumerate(pool) if t in l]
You could then print your found list into your outfile.
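For example (a sketch; the output filename 'GOlines.txt' is an assumption):

# write each matching line from GO.txt to the output file
with open('GOlines.txt', 'w') as out:
    for token, lno in found:
        out.write(pool[lno])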

Group and Check-mark using Python

I have several files, each of which has data like this (filename:data inside separated by newline):
Mike: Plane\nCar
Paula: Plane\nTrain\nBoat\nCar
Bill: Boat\nTrain
Scott: Car
How can I create a csv file using Python that groups all the different vehicles and then puts an X on the applicable person, like:
Assuming those line numbers aren't in there (easy enough to fix if they are), and with an input file like the following:
Mike: Plane
Car
Paula: Plane
Train
Boat
Car
Bill: Boat
Train
Scott: Car
A solution can be found here: https://gist.github.com/999481
import sys
from collections import defaultdict
import csv

# see http://stackoverflow.com/questions/6180609/group-and-check-mark-using-python

def main():
    # files = ["group.txt"]
    files = sys.argv[1:]
    if len(files) < 1:
        print "usage: ./python_checkmark.py file1 [file2 ... filen]"
    name_map = defaultdict(set)
    for f in files:
        file_handle = open(f, "r")
        process_file(file_handle, name_map)
        file_handle.close()
    print_csv(sys.stdout, name_map)

def process_file(input_file, name_map):
    cur_name = ""
    for line in input_file:
        if ":" in line:
            cur_name, item = [x.strip() for x in line.split(":")]
        else:
            item = line.strip()
        name_map[cur_name].add(item)

def print_csv(output_file, name_map):
    names = name_map.keys()
    items = set([])
    for item_set in name_map.values():
        items = items.union(item_set)
    writer = csv.writer(output_file, quoting=csv.QUOTE_MINIMAL)
    writer.writerow([""] + names)
    for item in sorted(items):
        row_contents = map(lambda name: "X" if item in name_map[name] else "", names)
        row = [item] + row_contents
        writer.writerow(row)

if __name__ == '__main__':
    main()
Output:
,Mike,Bill,Scott,Paula
Boat,,X,,X
Car,X,,X,X
Plane,X,,,X
Train,,X,,X
The only thing this script doesn't do is keep the columns in the same order the names appear. You could keep a separate list to maintain the order, since maps/dicts are inherently unordered.
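A small sketch of that change to the functions above (hypothetical, not in the gist): record each name the first time it is seen and use that list for the column order.

# keep an ordered list of names alongside the map
name_order = []

def process_file(input_file, name_map):
    cur_name = ""
    for line in input_file:
        if ":" in line:
            cur_name, item = [x.strip() for x in line.split(":")]
            if cur_name not in name_order:
                name_order.append(cur_name)  # remember first-seen order
        else:
            item = line.strip()
        name_map[cur_name].add(item)

# and in print_csv, use the ordered list instead of name_map.keys():
#     names = name_order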
Here is an example of how to parse this kind of file.
Note that the dictionary is unordered here. You can use an ordered dict (in Python 3.2 / 2.7) from the standard library, find an available implementation / backport if you have an older Python version, or just keep the order in an additional list :)
data = {}
name = None
with open(file_path) as f:
    for line in f:
        if ':' in line:  # we have a name here
            name, first_vehicle = [x.strip() for x in line.split(':')]
            data[name] = set([first_vehicle, ])  # a set of vehicles per name
        else:
            if name:
                data[name].add(line.strip())

# now a dictionary with names/vehicles is available
# let's convert it to a simple csv-formatted string..

# a set of all available vehicles
vehicles = set(v for vlist in data.values()
               for v in vlist)

for name in data:
    name_vehicles = data[name]
    csv_vehicles = ''
    for v in vehicles:
        if v in name_vehicles:
            csv_vehicles += v
        csv_vehicles += ','
    csv_line = name + ',' + csv_vehicles
Assuming that the input looks like this:
Mike: Plane
Car
Paula: Plane
Train
Boat
Car
Bill: Boat
Train
Scott: Car
This python script places the vehicles in a dictionary, indexed by person:
#!/usr/bin/python
persons = {}
vehicles = set()
with open('input') as fd:
    for line in fd:
        line = line.strip()
        if ':' in line:
            tmp = line.split(':')
            p = tmp[0].strip()
            v = tmp[1].strip()
            persons[p] = [v]
            vehicles.add(v)
        else:
            persons[p].append(line)
            vehicles.add(line)

for k, v in persons.iteritems():
    print k, v

print 'vehicles', vehicles
Result:
Mike ['Plane', 'Car']
Bill ['Boat', 'Train']
Scott ['Car']
Paula ['Plane', 'Train', 'Boat', 'Car']
vehicles set(['Train', 'Car', 'Plane', 'Boat'])
Now, all the data needed are placed in data-structures. The csv-part is left as an exercise for the reader :-)
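For completeness, one possible sketch of that csv part, built on the persons and vehicles structures above (the output filename is an assumption):

import csv

names = sorted(persons)
with open('output.csv', 'w') as out:
    writer = csv.writer(out)
    writer.writerow([''] + names)  # header row: empty corner cell, then the names
    for vehicle in sorted(vehicles):
        # an X in a column means that person has that vehicle
        writer.writerow([vehicle] + ['X' if vehicle in persons[n] else '' for n in names])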
The most elegant and simple way would be like so:
import os

vehiclesToPeople = {}
people = []
for root, dirs, files in os.walk('/path/to/folder/with/files'):
    for file in files:
        person = file
        people += [person]
        path = os.path.join(root, file)
        with open(path) as f:
            for vehicle in f:
                vehicle = vehicle.strip()
                vehiclesToPeople.setdefault(vehicle, set()).add(person)

people.sort()
table = [[''] + people]
for vehicle, owners in vehiclesToPeople.items():
    table.append([vehicle] + [('X' if p in owners else '') for p in people])

csv = '\n'.join(','.join(row) for row in table)
You can do pprint.pprint(table) as well to look at it.
