Splitting a dictionary and writing it to different CSV files in Python

I want to split a Python dictionary and write it to different files based on NO_OF_LINES_PER_FILE and the size of the dictionary.
Input
NO_OF_LINES_PER_FILE
So if NO_OF_LINES_PER_FILE = 2 and the size of the dictionary is 10, I want the dictionary to be split into 5 files (each file will have 2 rows).
Script
import csv

NO_OF_LINES_PER_FILE = 2
s = {"2222":["1","2","3"],"3456":["2","3","4"],"5634":["4","5"],"23543":["456","3"],"29587":["4","5"],"244":["23","34"],"455":["3","4"],"244221":["5"],"23232345":["2323","43"]}

def again(c, h, NO_OF_LINES_PER_FILE1):
    f3 = open('num_' + str(h) + '.csv', 'at')
    if c == 1:
        ceh = 2
    else:
        ceh = c
    print ceh
    v = 0
    for w in s:
        v = v + 1
        if v < ceh:
            pass
        elif v > NO_OF_LINES_PER_FILE1:
            print "yes"
            NO_OF_LINES_PER_FILE1 = NO_OF_LINES_PER_FILE1 + 1
            h = NO_OF_LINES_PER_FILE1 + 1
            again(c, h, NO_OF_LINES_PER_FILE1)
        else:
            writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
            writer.writerow(s[w])
            c = c + 1

def split():
    f3 = open('has_' + str(NO_OF_LINES_PER_FILE) + '.csv', 'at')
    writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
    c = 0
    for w in s:
        if c >= NO_OF_LINES_PER_FILE:
            NO_OF_LINES_PER_FILE1 = NO_OF_LINES_PER_FILE + 1
            h = NO_OF_LINES_PER_FILE
            again(c, h, NO_OF_LINES_PER_FILE1)
            break
        else:
            #print NO_OF_LINES_PER_FILE
            writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
            writer.writerow(s[w])
            c = c + 1

split()
But this script is not working and creates many files.
In the above script NO_OF_LINES_PER_FILE = 2 and the size of dictionary s is 9,
so I want 5 files: the first four files will contain 2 rows each and the fifth file will contain 1 row.
How can I solve this problem?

My method is to flatten the dict first, then split the flat dict into sublists of the length you want:
import csv

flatDict = [i for i in s.items()]
splitFlatDict = [flatDict[i:i + NO_OF_LINES_PER_FILE]
                 for i in xrange(0, len(flatDict), NO_OF_LINES_PER_FILE)]
for i, rows in enumerate(splitFlatDict):
    with open(str(i) + '.csv', 'wb') as f:
        writer = csv.writer(f)
        writer.writerows(rows)
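On Python 3 the same idea could look like this (a sketch: range replaces xrange, CSV files are opened in text mode with newline='', and each key is flattened into its row; s and NO_OF_LINES_PER_FILE are the names from the question):
import csv

flat = list(s.items())
chunks = [flat[i:i + NO_OF_LINES_PER_FILE]
          for i in range(0, len(flat), NO_OF_LINES_PER_FILE)]
for i, rows in enumerate(chunks):
    with open(str(i) + '.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        # write each key followed by its values as one row
        writer.writerows([key] + values for key, values in rows)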

Related

Python: How to read space-delimited data with different line lengths in a text file and parse it

I have space-delimited data in a text file that looks like the following:
0 1 2 3
1 2 3
3 4 5 6
1 3 5
1
2 3 5
3 5
Each line has a different length.
I need to read it starting from line 2 ('1 2 3'),
parse it, and get the following information:
Number of unique data = (1,2,3,4,5,6)=6
Count of each data:
count data (1)=3
count data (2)=2
count data (3)=5
count data (4)=1
count data (5)=4
count data (6)=1
Number of lines=6
Sort the data in descending order:
data (3)
data (5)
data (1)
data (2)
data (4)
data (6)
I did this:
import csv

file = open('data.txt')
csvreader = csv.reader(file)
header = next(csvreader)
print(header)
rows = []
for row in csvreader:
    rows.append(row)
print(rows)
After this step, what should I do to get the expected results?
I would do something like this:
from collections import Counter

with open('data.txt', 'r') as file:
    lines = file.readlines()
lines = lines[1:]  # skip first line

data = []
for line in lines:
    data += line.strip().split(" ")
counter = Counter(data)
print(f'unique data: {list(counter.keys())}')
print(f'count data: {list(sorted(counter.most_common(), key=lambda x: x[0]))}')
print(f'number of lines: {len(lines)}')
print(f'sort data: {[x[0] for x in counter.most_common()]}')
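For the sample file this prints:
unique data: ['1', '2', '3', '4', '5', '6']
count data: [('1', 3), ('2', 2), ('3', 5), ('4', 1), ('5', 4), ('6', 1)]
number of lines: 6
sort data: ['3', '5', '1', '2', '4', '6']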
A simple brute force approach:
nums = []
counts = {}
for row in open('data.txt'):
    if row[0] == '0':
        continue
    nums.extend([int(k) for k in row.rstrip().split()])
print(nums)
for n in nums:
    if n not in counts:
        counts[n] = 1
    else:
        counts[n] += 1
print(counts)
ordering = list(sorted(counts.items(), key=lambda k: -k[1]))
print(ordering)
Here is another approach:
def getData(infile):
    """Read file lines and return lines 1 thru end."""
    lnes = []
    with open(infile, 'r') as data:
        lnes = data.readlines()
    return lnes[1:]

def parseData(ld):
    """Parse data and print desired results."""
    unique_symbols = set()
    all_symbols = dict()
    for l in ld:
        symbols = l.strip().split()
        for s in symbols:
            unique_symbols.add(s)
            cnt = all_symbols.pop(s, 0)
            cnt += 1
            all_symbols[s] = cnt
    print(f'Number of Unique Symbols = {len(unique_symbols)}')
    print(f'Number of Lines Processed = {len(ld)}')
    for symb in unique_symbols:
        print(f'Number of {symb} = {all_symbols[symb]}')
    print(f"Descending Sort of Symbols = {', '.join(sorted(list(unique_symbols), reverse=True))}")
On executing:
infile = r'spaced_text.txt'
parseData(getData(infile))
Produces:
Number of Unique Symbols = 6
Number of Lines Processed = 6
Number of 2 = 2
Number of 5 = 4
Number of 3 = 5
Number of 1 = 3
Number of 6 = 1
Number of 4 = 1
Descending Sort of Symbols = 6, 5, 4, 3, 2, 1

compare the first word from 2 text files

I have these 2 CSV files. The first word is the "key" and I need to compare the old file t1 with the new one t2. If the keys match, I need to display the content from the new file.
cat /tmp/t1.txt
a, b, c
d, e, f
g, m, p
cat /tmp/t2.txt
d, x, y
g, h, i
But the way this loop is written, it shows the entry from the old file for the key 'g', while it works correctly for the key 'd'.
with open("/tmp/t1.txt", "r") as f:
with open("/tmp/t2.txt", "r") as n:
for nline in n:
for fline in f:
if nline.split()[0] == fline.split()[0]:
print("nline", nline)
else:
print("fline", fline)
The result is:
fline a, b, c
nline d, x, y
fline g, m, p
The last line should look like this:
nline g, h, i
Basically you have to reopen the file once you have read through it:
with open("/tmp/t2.txt", "r") as n:
for nline in n:
with open("/tmp/t1.txt", "r") as f: # <<-- moved this line here
for fline in f:
# some comparisons here
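One way to complete that skeleton so it reproduces the desired output is to put the old file in the outer loop and reopen the new file for every old line (a sketch, not the only way to write it):
with open("/tmp/t1.txt", "r") as f:
    for fline in f:
        replaced = False
        with open("/tmp/t2.txt", "r") as n:  # reopened for every fline
            for nline in n:
                if nline.split()[0] == fline.split()[0]:
                    print("nline", nline.strip())
                    replaced = True
                    break
        if not replaced:
            print("fline", fline.strip())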
Another good approach would be to read both files once, and then compare the data:
with open("/tmp/t1.txt", "r") as fin :
data1 = fin.readlines()
with open("/tmp/t2.txt", "r") as fin :
data2 = fin.readlines()
for nline in data1 :
for fline in data2 :
# put your logic here
OK, answering the question from the comments:
with open("/tmp/t1.txt", "r") as fin :
data1 = [ (i[0], i) for i in fin.readlines() if len(i) > 3 ]
with open("/tmp/t2.txt", "r") as fin :
data2 = { i[0] : i for i in fin.readlines() if len(i) > 3 }
for key,value in data1 :
print data2[key] if key in data2 else value
Using pandas
Caveat: the keys in t2 must be unique.
This generates a dataframe with rows updated where the t1 key matches the t2 key.
import pandas as pd
# read files
t1 = pd.read_csv('t1.txt', header=None)
t2 = pd.read_csv('t2.txt', header=None)
# t2 to a dict
t2_d = t2.set_index(0, drop=False).to_dict(orient='index')
# look for t1 in t2_d
t1.apply(lambda x: t2_d[x[0]] if x[0] in t2_d else x, axis=1)
   0  1  2
0  a  b  c
1  d  x  y
2  g  h  i
Optionally
This option adds a column, in_t2, to indicate whether the t1 key is in t2.
Create updated and join it to t1 to easily verify the updated values.
# read files
t1 = pd.read_csv('t1.txt', header=None)
t2 = pd.read_csv('t2.txt', header=None)
# add Boolean column
t1['in_t2'] = t1[0].isin(t2[0])
# t2 to a dict
t2_d = t2.set_index(0, drop=False).to_dict(orient='index')
# look for t1 in t2_d
updated = t1.iloc[:, :-1].apply(lambda x: t2_d[x[0]] if x[0] in t2_d else x, axis=1)
# join t1 and updated
pd.concat([t1, updated], axis=1)
   0  1  2  in_t2  0  1  2
0  a  b  c  False  a  b  c
1  d  e  f   True  d  x  y
2  g  m  p   True  g  h  i

combine data from different files

I have multiple files; for the purposes of this question I am showing only two files:
TXT1
id value
1 4
2 4
4 5
TXT2
id value
2 6
3 5
5 3
Desired output: first collect all the elements of the id column from all 40 files and name the column header value_TXT1 (the file name). If a value is found, enter it; else add 0.
id value_TXT1 value_TXT2
1 4 0
2 4 6
3 0 5
4 5 0
5 0 3
I have ~40 files in a directory from which I need to make a final table like this, so my final table will have:
id value_TXT1 value_TXT2........valueTXT40
Any pseudocode or tutorial would be helpful; apologies, I have not tried anything as I am confused about how to approach this.
EDIT:
this is what I have tried so far from different sources:
import glob
import os

data_dict = {}
path = '/Users/a/Desktop/combine/*.txt'
paths = '/Users/a/Desktop/combine/'
files = glob.glob(path)
filelist = os.listdir(paths)  # Make a file list
file_names = [os.path.splitext(x)[0] for x in filelist]  # header
print file_names
for file in files:
    f = open(file, 'r')
    f.readline()
    for i in f:
        (key, value) = i.split()
        data_dict[key] = value
print data_dict
output:
['combine', 'combine2']
{'1': '4', '3': '5', '2': '4', '5': '3', '4': '5'}
two files called
combine.txt
id value
2 6
3 5
5 3
combine1.txt
id value
1 4
2 4
4 5
I assume:
files are in the same folder
they all start with "TXT"
The text is tab separated
Requirement: pandas
Input:
TXT1
1 4
2 3
3 5
4 3
7 5
TXT2
1 4
2 4
4 5
6 3
Here is the code:
import pandas as pd
import glob

path = "/my/full/path/"
file_list = glob.glob1(path, "TXT*")

res = pd.DataFrame()
for filename in file_list:
    df = pd.read_csv(path + filename, header=None, sep=" ", index_col=0,
                     names=["values_" + filename])
    res = pd.concat([res, df], axis=1)
res = res.fillna(0)
print res.astype(int)
Output:
values_TXT1 values_TXT2
1 4 4
2 3 4
3 5 0
4 3 5
6 0 3
7 5 0
You can also export it to csv again with:
res.to_csv("export.csv", sep=",")
You can find more parameters in the documentation.
First parse all 40 files, and get a dictionary data_dict.
(pseudo code)
data_dict = {}

def parse_file(txt_i):
    for id, value in data_rows:
        if id not in data_dict:
            data_dict[id] = [0 ... 0]  # 40 zeros indicate the default values from each TXT file
        data_dict[id][i] = value  # set value of the ith TXT file
Then print out the content of data_dict in the format you want.
for id in data_dict:
    print id
    for value in data_dict[id]:
        print value
Remember to take care of headers. (id value_TXT1 value_TXT2........valueTXT40)
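A runnable sketch of that pseudocode, assuming space-separated "id value" lines with a header row, and using a hypothetical TXT* glob pattern to find the 40 files:
import glob

data_dict = {}
files = sorted(glob.glob('TXT*'))  # hypothetical pattern for the 40 files

for i, fname in enumerate(files):
    with open(fname) as f:
        next(f)  # skip the "id value" header
        for line in f:
            id_, value = line.split()
            if id_ not in data_dict:
                data_dict[id_] = [0] * len(files)  # one default per file
            data_dict[id_][i] = value

# header, then one row per id
print('id ' + ' '.join('value_' + fname for fname in files))
for id_ in sorted(data_dict, key=int):
    print(id_ + ' ' + ' '.join(str(v) for v in data_dict[id_]))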
Here I propose a solution based on the following assumptions:
1) the files are all tab separated or comma separated
2) comma appears only as a separator
3) all the files you want to process are in the same folder
Here it goes:
#1 make a list fo files to precess
import glob
folder = 'path_to_my_folder'
extension = '*.txt' #it can be *.*
files = glob.glob(folder + '/' + extension)
#2 initialize a dict
data = {}
#3 read all the files and update the dict
for n, file in enumerate(files):
with open(file, 'r') as f:
separator = False
for line in f:
if line[0] == 'E': #check for ID-containing lines
if ',' in line:
separator = ','
else:
separator = '\t'
id, value = line.strip().split(separator)
try:
data[id].append(value)
except KeyError:
data[id] = []
#fill with 0 the id not found on previous files
while len(data[id]) < n:
data[id].append(0)
data[id].append(value)
#fill with 0 the id not found on this file
for k,v in data.items(): #.iteritems() on python2
while len(v) < n+1: #if n=0 then len must be 1
data[k].append(0)
#print the result
#first line
print('id', end='')
for file in files:
print('\t{}'.format(file), end='')
#the rest
for k, v in data.items():
print('\n{}'.format(k), end='')
for item in v:
print('\t{}'.format(item), end='')
#to write it in a file
with open('myfile.txt' , 'w') as f:
#write header
f.write('id')
for file in files:
f.write('\t{}'.format(file))
f.write('\n') #go to the next line (optional)
for k, v in data.items():
f.write('\n{}'.format(k))
for item in v:
f.write('\t{}'.format(item))

Index Error: Index out of bounds when using numpy in python

I have code that works fine with small CSVs of data but errors out when I try to run large CSVs through it. In essence this code is supposed to place 3 CSVs' worth of data into 3 separate dictionaries, combine those dictionaries into a master dictionary, and then perform arithmetic operations on the dictionary. The input CSVs look something like this:
time A B C D
0 3 4 6 4
.001 4 6 7 8
.002 4 6 7 3
The code that I am using is displayed below. The error occurs at lines 47 and 65, where I try to perform arithmetic with the dictionary. Any explanation as to why this is going on is greatly appreciated.
import numpy

Xcoord = {}
time = []
with open('Nodal_QuardnetsX2.csv', 'r') as f:
    f.readline()  # Skips first line
    for line in f:
        values = [s.strip() for s in line.split(',')]
        Xcoord[values[0]] = map(float, values[1:])
        time.append(values[0])

Ycoord = {}
with open('Nodal_QuardnetsY2.csv', 'r') as f:
    f.readline()  # Skips first line
    for line in f:
        values = [s.strip() for s in line.split(',')]
        Ycoord[values[0]] = map(float, values[1:])

Zcoord = {}
with open('Nodal_QuardnetsZ2.csv', 'r') as f:
    f.readline()  # Skips first line
    for line in f:
        values = [s.strip() for s in line.split(',')]
        Zcoord[values[0]] = map(float, values[1:])

# Create a master dictionary of the form {'key': [[x, y, z], [x, y, z]]}
CoordCombo = {}
for key in Xcoord.keys():
    CoordnateList = zip(Xcoord[key], Ycoord[key], Zcoord[key])
    CoordCombo[key] = CoordnateList

counter = 0
keycount1 = 0
keycount2 = 0.001
difference = []
NodalDisplacements = {}
# Find the difference between the x, y, and z quardnets relative to that point in time
while keycount2 <= float(values[0]):
    Sub = numpy.subtract(CoordCombo[str(keycount2)][counter], CoordCombo[str(keycount1)][counter])
    counter = counter + 1
    difference.append(Sub)
    NodalDisplacements[keycount1] = Sub
    keycount1 = keycount1 + 0.001
    keycount2 = keycount2 + 0.001

counter = 0
keycount3 = 0
keycount4 = 0.001
Sum = []
breakpoint = float(values[0]) - 0.001
while keycount4 <= breakpoint:
    Add = numpy.sum(NodalDisplacements[keycount4][counter], NodalDisplacements[keycount3][counter])
    Sum.append(Add)
    keycount3 = keycount3 + 0.001
    keycount4 = keycount4 + 0.001
    counter = counter + 1
    if counter == 2:
        counter = 0
print Sum
Probably a line of your csv file does not contain 5 elements, or the line is empty.
In your logic I would suggest using something like:
N_COLS = 5
for line in f:
    line = line.strip()
    if not line:
        continue  # skip empty lines
    values = [s.strip() for s in line.split(',')]
    if len(values) != N_COLS:
        continue  # or raise an error...
    # other ...
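Applied to the first reading loop from the question (the same guard works for the Y and Z files), a sketch might look like this:
N_COLS = 5  # the time column plus A, B, C, D

Xcoord = {}
time = []
with open('Nodal_QuardnetsX2.csv', 'r') as f:
    f.readline()  # skip the header line
    for line in f:
        line = line.strip()
        if not line:
            continue  # ignore blank lines
        values = [s.strip() for s in line.split(',')]
        if len(values) != N_COLS:
            continue  # or log/raise on malformed rows
        Xcoord[values[0]] = map(float, values[1:])
        time.append(values[0])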

Delete and save duplicate in another file

In test.txt:
1 a
2 b
3 c
4 a
5 d
6 c
I want to remove duplicate and save the rest in test2.txt:
2 b
5 d
I tried to start with the code below.
file1 = open('../test.txt').read().split('\n')
#file2 = open('../test2.txt', "w")
word = set()
for line in file1:
    if line:
        sline = line.split('\t')
        if sline[1] not in word:
            print sline[0], sline[1]
            word.add(sline[1])
#file2.close()
The results from the code showed:
1 a
2 b
3 c
5 d
Any suggestions?
You can use collections.OrderedDict here:
from collections import OrderedDict

with open('abc') as f:
    dic = OrderedDict()
    for line in f:
        v, k = line.split()
        dic.setdefault(k, []).append(v)
Now dic looks like:
OrderedDict([('a', ['1', '4']), ('b', ['2']), ('c', ['3', '6']), ('d', ['5'])])
Now we only need those keys whose list contains exactly 1 item.
for k, v in dic.iteritems():
    if len(v) == 1:
        print v[0], k
...
2 b
5 d
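On Python 3.7+ a plain dict preserves insertion order, so the same idea can also write the result straight to test2.txt (a sketch, assuming the file layout from the question):
dic = {}
with open('test.txt') as f:
    for line in f:
        v, k = line.split()
        dic.setdefault(k, []).append(v)

with open('test2.txt', 'w') as out:
    for k, v in dic.items():
        if len(v) == 1:  # letters that appear exactly once
            out.write('{} {}\n'.format(v[0], k))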
What you're doing is just making sure every second item (the letter) gets printed out only once, which obviously is not what you say you want.
You must split your code into two halves: one part that reads and gathers statistics about letter counts, and another that prints only those with count == 1.
Converting your original code (I just made it a little simpler):
file1 = open('../test.txt')
words = {}
for line in file1:
    if line:
        line_num, letter = line.split('\t')
        if letter not in words:
            words[letter] = [1, line_num]
        else:
            words[letter][0] += 1
for letter, (count, line_num) in words.iteritems():
    if count == 1:
        print line_num, letter
I tried to keep it as similar to your style as possible:
file1 = open('../test.txt').read().split('\n')
word = set()
test = []
duplicate = []
sin_duple = []
num_lines = 0
num_duplicates = 0
for line in file1:
    if line:
        sline = line.split(' ')
        test.append(" ".join([sline[0], sline[1]]))
        if (sline[1] not in word):
            word.add(sline[1])
            num_lines = num_lines + 1
        else:
            sin_duple.append(sline[1])
            duplicate.append(" ".join([sline[0], sline[1]]))
            num_lines = num_lines + 1
            num_duplicates = num_duplicates + 1
for i in range(0, num_lines + 1):
    for item in test:
        for j in range(0, num_duplicates):
            #print((str(i) + " " + str(sin_duple[j])))
            if item == (str(i) + " " + str(sin_duple[j])):
                test.remove(item)
file2 = open("../test2.txt", 'w')
for item in test:
    file2.write("%s\n" % item)
file2.close()
How about some pandas:
import pandas as pd

# the file has no header row; the column names here are assumed
a = pd.read_csv("test_remove_dupl.txt", sep=" ", header=None, names=["num", "letter"])
# keep=False drops every row whose letter is duplicated, not just the repeats
b = a.drop_duplicates(subset="letter", keep=False)
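To save the remaining rows to test2.txt as the question asks, one way (given the column names assumed above) is:
b.to_csv("test2.txt", sep=" ", header=False, index=False)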
