I have multiple files; for the purpose of this question I am showing only two of them:
TXT1
id value
1 4
2 4
4 5
TXT2
id value
2 6
3 5
5 3
Desired output: first collect all the elements of the id column from all 40 files, then name each column header value_TXT1 (after the file name). If a value is found for an id, enter it; otherwise add 0.
id value_TXT1 value_TXT2
1 4 0
2 4 6
3 0 5
4 5 0
5 3 0
I have ~40 files in a directory from which I need to make a final table like this, so my final table will have:
id value_TXT1 value_TXT2 ... value_TXT40
Any pseudocode or tutorial would be helpful; apologies, I have not tried anything yet as I am confused about how to approach this.
EDIT:
This is what I have tried so far, pieced together from different sources:
import glob
import os

data_dict = {}
path = '/Users/a/Desktop/combine/*.txt'
paths = '/Users/a/Desktop/combine/'
files = glob.glob(path)

filelist = os.listdir(paths)                              # make a file list
file_names = [os.path.splitext(x)[0] for x in filelist]   # headers
print file_names

for file in files:
    f = open(file, 'r')
    f.readline()                                          # skip the header line
    for i in f:
        (key, value) = i.split()
        data_dict[key] = value   # NOTE: one flat dict, so later files overwrite earlier values
print data_dict
Output:
['combine', 'combine2']
{'1': '4', '3': '5', '2': '4', '5': '3', '4': '5'}
The two files are called:
combine.txt
id value
2 6
3 5
5 3
combine1.txt
id value
1 4
2 4
4 5
I assume:
files are in the same folder
they all start with "TXT"
the text is space-separated (the code below uses sep=" "; adjust it if your files are tab-separated)
Requirement: pandas
Input:
TXT1
1 4
2 3
3 5
4 3
7 5
TXT2
1 4
2 4
4 5
6 3
Here is the code:
import pandas as pd
import glob

path = "/my/full/path/"
file_list = glob.glob1(path, "TXT*")   # every file starting with "TXT"

res = pd.DataFrame()
for filename in file_list:
    df = pd.read_csv(path + filename, header=None, sep=" ",
                     index_col=0, names=["values_" + filename])
    res = pd.concat([res, df], axis=1)

res = res.fillna(0)                    # ids missing from a file get 0
print res.astype(int)
Output:
values_TXT1 values_TXT2
1 4 4
2 3 4
3 5 0
4 3 5
6 0 3
7 5 0
You can also export it to CSV again with:
res.to_csv("export.csv", sep=",")
You can find more parameters in the pandas documentation.
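Under the same assumptions, an equivalent sketch builds a dict of Series and lets the DataFrame constructor align the indexes; os.path.splitext strips any extension from the column names:

import glob
import os
import pandas as pd

path = "/my/full/path/"                     # same assumed folder as above
series = {}
for fname in glob.glob1(path, "TXT*"):
    col = "values_" + os.path.splitext(fname)[0]
    df = pd.read_csv(os.path.join(path, fname), sep=" ",
                     header=None, index_col=0, names=["id", col])
    series[col] = df[col]

# the DataFrame constructor aligns all the indexes; missing ids become NaN
res = pd.DataFrame(series).fillna(0).astype(int)
res.to_csv("export.csv", sep=",")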
First parse all 40 files and build a dictionary data_dict.
(pseudocode)
data_dict = {}

def parse_file(txt_i):
    for id, value in data_rows:
        if id not in data_dict:
            data_dict[id] = [0] * 40   # 40 zeros: one default value per TXT file
        data_dict[id][i] = value       # set the value for the i-th TXT file
Then print out the content of data_dict in the format you want.
for id in data_dict:
    print id
    for value in data_dict[id]:
        print value
Remember to take care of the headers (id value_TXT1 value_TXT2 ... value_TXT40).
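A minimal runnable version of that pseudocode (a sketch, assuming whitespace-separated files named TXT1 ... TXT40 in the current directory, each with an "id value" header and numeric ids) could be:

import glob

files = sorted(glob.glob("TXT*"))      # assumed naming scheme
data_dict = {}

for i, fname in enumerate(files):
    with open(fname) as f:
        next(f)                        # skip the "id value" header
        for line in f:
            if not line.strip():
                continue               # ignore blank lines
            key, value = line.split()
            if key not in data_dict:
                data_dict[key] = [0] * len(files)
            data_dict[key][i] = value

# header row, then one tab-separated row per id
print("id\t" + "\t".join("value_" + name for name in files))
for key in sorted(data_dict, key=int):  # numeric ids assumed
    print(key + "\t" + "\t".join(str(v) for v in data_dict[key]))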
Here I propose a solution based on the following assumptions:
1) the files are all tab-separated or comma-separated
2) the comma appears only as a separator
3) all the files you want to process are in the same folder
Here it goes:
#1 make a list of files to process
import glob

folder = 'path_to_my_folder'
extension = '*.txt'   # it can be *.*
files = glob.glob(folder + '/' + extension)
#2 initialize a dict
data = {}

#3 read all the files and update the dict
for n, file in enumerate(files):
    with open(file, 'r') as f:
        for line in f:
            # skip the header line and blank lines
            # (the original checked line[0] == 'E' for its own ID format)
            if line.startswith('id') or not line.strip():
                continue
            if ',' in line:           # detect the separator on each data line
                separator = ','
            else:
                separator = '\t'
            id, value = line.strip().split(separator)
            try:
                data[id].append(value)
            except KeyError:
                data[id] = []
                # fill with 0 the ids not found in previous files
                while len(data[id]) < n:
                    data[id].append(0)
                data[id].append(value)
    # fill with 0 the ids not found in this file
    for k, v in data.items():   # .iteritems() on python2
        while len(v) < n + 1:   # if n == 0 then len must be 1
            data[k].append(0)
#print the result
#first line
print('id', end='')
for file in files:
    print('\t{}'.format(file), end='')   # note: this prints the full path, not just the file name
#the rest
for k, v in data.items():
    print('\n{}'.format(k), end='')
    for item in v:
        print('\t{}'.format(item), end='')
#to write it in a file
with open('myfile.txt', 'w') as f:
    # write the header
    f.write('id')
    for file in files:
        f.write('\t{}'.format(file))
    # each data row below starts with '\n', so no extra newline is needed here
    for k, v in data.items():
        f.write('\n{}'.format(k))
        for item in v:
            f.write('\t{}'.format(item))
Related
I have space-delimited data in a text file that looks like the following:
0 1 2 3
1 2 3
3 4 5 6
1 3 5
1
2 3 5
3 5
Each line has a different length. I need to read it starting from line 2 ('1 2 3'), parse it, and get the following information:
Number of unique data = (1,2,3,4,5,6)=6
Count of each data:
count data (1)=3
count data (2)=2
count data (3)=5
count data (4)=1
count data (5)=4
count data (6)=1
Number of lines=6
Sort the data in descending order:
data (3)
data (5)
data (1)
data (2)
data (4)
data (6)
I did this:
import csv

file = open('data.txt')
csvreader = csv.reader(file)
header = next(csvreader)
print(header)
rows = []
for row in csvreader:
    rows.append(row)
print(rows)
After this step, what should I do to get the expected results?
I would do something like this:
from collections import Counter

with open('data.txt', 'r') as file:
    lines = file.readlines()

lines = lines[1:]  # skip the first line
data = []
for line in lines:
    data += line.strip().split(" ")

counter = Counter(data)
print(f'unique data: {list(counter.keys())}')
print(f'count data: {list(sorted(counter.most_common(), key=lambda x: x[0]))}')
print(f'number of lines: {len(lines)}')
print(f'sort data: {[x[0] for x in counter.most_common()]}')
A simple brute force approach:
nums = []
counts = {}
for row in open('data.txt'):
    if row[0] == '0':      # skip the first line, which starts with '0'
        continue
    nums.extend([int(k) for k in row.rstrip().split()])
print(nums)

for n in nums:
    if n not in counts:
        counts[n] = 1
    else:
        counts[n] += 1
print(counts)

ordering = list(sorted(counts.items(), key=lambda k: -k[1]))
print(ordering)
Here is another approach:
def getData(infile):
    """ Read file lines and return lines 1 through the end. """
    with open(infile, 'r') as data:
        lines = data.readlines()
    return lines[1:]

def parseData(ld):
    """ Parse data and print the desired results. """
    unique_symbols = set()
    all_symbols = dict()
    for l in ld:
        symbols = l.strip().split()
        for s in symbols:
            unique_symbols.add(s)
            cnt = all_symbols.pop(s, 0)
            cnt += 1
            all_symbols[s] = cnt
    print(f'Number of Unique Symbols = {len(unique_symbols)}')
    print(f'Number of Lines Processed = {len(ld)}')
    for symb in unique_symbols:
        print(f'Number of {symb} = {all_symbols[symb]}')
    print(f"Descending Sort of Symbols = {', '.join(sorted(unique_symbols, reverse=True))}")
On executing:
infile = r'spaced_text.txt'
parseData(getData(infile))
Produces:
Number of Unique Symbols = 6
Number of Lines Processed = 6
Number of 2 = 2
Number of 5 = 4
Number of 3 = 5
Number of 1 = 3
Number of 6 = 1
Number of 4 = 1
Descending Sort of Symbols = 6, 5, 4, 3, 2, 1
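Note that the last line sorts the symbols themselves in reverse order, not by count. If you want them ordered by descending occurrence count, as in the question's example, you could sort on the stored counts instead (inside parseData, where all_symbols is in scope):

# order symbols by descending occurrence count
by_count = sorted(all_symbols.items(), key=lambda kv: kv[1], reverse=True)
for symb, cnt in by_count:
    print(f'data ({symb})')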
I have three files:
file:1
mango
banana
orange
file:2 -> the count is a string, because when I wrote to file:2, write() only let me write strings.
mango 2
banana 3
file:3 -> the count is a string, because when I wrote to file:3, write() only let me write strings.
banana 4
orange 3
I want to take file:1 and check it against file:2 & file:3. If an entry is present, I want to take the one with the biggest count and write it to file:4.
Expected output in file:4
mango 2
banana 4
orange 3
I tried writing file:2 and file:3 to a dictionary and doing a dictionary compare, but I am getting lost with too many open() files.
I am new to Python. Not being able to write an integer to a file with write() itself threw me off.
Appreciate your help/hints.
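(As an aside on that last point: write() only accepts strings, so an integer count has to be converted first, e.g. with str() or an f-string:)

count = 4
with open("file4.txt", "w") as f:
    f.write("banana " + str(count) + "\n")   # or: f.write(f"banana {count}\n")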
The following produces file:4 from file:1, file:2, and file:3:
def load_file(filepath):
    """ Load a file as a dictionary. """
    with open(filepath, 'r') as f:
        return dict(line.rstrip().split() for line in f)

# Get keys
with open('file1.txt') as file1:
    keys = [line.rstrip() for line in file1]

# Produce output (file4)
with open('file4.txt', 'w') as file_out:
    dic1 = load_file('file2.txt')
    dic2 = load_file('file3.txt')
    for k in keys:
        v1 = int(dic1.get(k, 0))   # convert dict counts to int
        v2 = int(dic2.get(k, 0))   # (default to 0 if not present)
        v = max(v1, v2)
        if v > 0:                  # only write if count > 0
            file_out.write(f"{k} {v}\n")
This works for as many input files as you want.
values = {}

def func(file):
    for line in file.readlines():
        elements = line.split()
        # file 1 has no count column, so default missing counts to 0
        count = elements[1] if len(elements) > 1 else 0
        if elements[0] in values:
            if int(count) > int(values[elements[0]]):
                values[elements[0]] = count
        else:
            values[elements[0]] = count
    file.close()

f = open("1.txt", "r")
func(f)
f = open("2.txt", "r")
func(f)
f = open("3.txt", "r")
func(f)

f = open("4.txt", "w+")
for key, val in values.items():
    print(key, " ", val)
    f.write("{} {}\n".format(key, val))   # format() converts the count to str
f.close()
I would like to extract the keys and values from an existing text file: the key in one variable and the value in another.
The text file (sample.txt) contains the below content,
one:two
three:four
five:six
seven:eight
nine:ten
sample:demo
I am able to read the content from the text file, but I am not able to proceed further to extract the keys and values.
with open ("sampletxt.txt", "r") as hfile:
sp = hfile.read()
print (sp)
x=0
for line in sp:
sp.split(":")[x].strip()
x+=1
The above only extracts the values, and it also raises an index-out-of-range exception at the end.
Iterating through the file, I am expecting the output below:
Key 0 = one
Key 1 = three
Key 2 = five
Key 3 = seven
Key 4 = sample
Value 0 = two
Value 1 = four
Value 2 = six
Value 3 = eight
Value 4 = ten
This should work:
with open ("sampletxt.txt", "r") as hfile:
sp = hfile.read()
print (sp)
lines = sp.split("\n")
for line in lines:
# print("line:[{0}]".format(line))
parts = line.split(":")
print("key:[{0}], value:[{1}]".format(parts[0], parts[1]))
This can work:
sp = open ("sampletxt.txt", "r")
x=0
key=[]
value=[]
try:
while True:
text_line = sp.readline()
if text_line:
text_line = ''.join(text_line)
text_line = text_line.split()
text_line = ''.join(text_line).split(':')
key.append(text_line[0])
value.append(text_line[1])
x += 1
else:
for i in range(x):
print("Key {} = {}".format(i,key[i]))
print("")
for i in range(x):
print("Value {} = {}".format(i,value[i]))
break
finally:
sp.close()
The output is:
Key 0 = one
Key 1 = three
Key 2 = five
Key 3 = seven
Key 4 = nine
Key 5 = sample
Value 0 = two
Value 1 = four
Value 2 = six
Value 3 = eight
Value 4 = ten
Value 5 = demo
which is similar to what you requested (this version also lists the nine:ten pair).
Why don't you try:
with open ("sampletxt.txt", "r") as hfile:
sp = hfile.read()
print (sp)
dictionary = {}
for x, line in enumerate(sp):
line_list = sp.split(":")
dictionary[line_list[0]]=line_list[1]
You should always check that split returns two members (or however many you expect) before using the indexes.
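A guarded version of that loop might look like this (a sketch; it skips blank or malformed lines instead of crashing):

with open("sampletxt.txt", "r") as hfile:
    dictionary = {}
    for line in hfile:
        parts = line.strip().split(":")
        if len(parts) == 2:                # only accept well-formed key:value lines
            dictionary[parts[0]] = parts[1]

for i, key in enumerate(dictionary):
    print("Key {} = {}".format(i, key))
for i, value in enumerate(dictionary.values()):
    print("Value {} = {}".format(i, value))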
I want to split a Python dictionary and write it to different files based on NO_OF_LINES_PER_FILE and the size of the dictionary.
Input
NO_OF_LINES_PER_FILE
So if NO_OF_LINES_PER_FILE = 2 and the size of the dictionary is 10, I want the dictionary to be split into 5 files (each file will have 2 rows).
Script
import csv

NO_OF_LINES_PER_FILE = 2
s = {"2222":["1","2","3"],"3456":["2","3","4"],"5634":["4","5"],"23543":["456","3"],"29587":["4","5"],"244":["23","34"],"455":["3","4"],"244221":["5"],"23232345":["2323","43"]}

def again(c, h, NO_OF_LINES_PER_FILE1):
    f3 = open('num_' + str(h) + '.csv', 'at')
    if c == 1:
        ceh = 2
    else:
        ceh = c
    print ceh
    v = 0
    for w in s:
        v = v + 1
        if v < ceh:
            pass
        elif v > NO_OF_LINES_PER_FILE1:
            print "yes"
            NO_OF_LINES_PER_FILE1 = NO_OF_LINES_PER_FILE1 + 1
            h = NO_OF_LINES_PER_FILE1 + 1
            again(c, h, NO_OF_LINES_PER_FILE1)
        else:
            writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
            writer.writerow(s[w])
            c = c + 1

def split():
    f3 = open('has_' + str(NO_OF_LINES_PER_FILE) + '.csv', 'at')
    writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
    c = 0
    for w in s:
        if c >= NO_OF_LINES_PER_FILE:
            NO_OF_LINES_PER_FILE1 = NO_OF_LINES_PER_FILE + 1
            h = NO_OF_LINES_PER_FILE
            again(c, h, NO_OF_LINES_PER_FILE1)
            break
        else:
            #print NO_OF_LINES_PER_FILE
            writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
            writer.writerow(s[w])
            c = c + 1

split()
But this script is not working; it creates too many files.
In the above script NO_OF_LINES_PER_FILE = 2 and the size of dictionary s is 9,
so I want 5 files: the first four files will contain 2 rows each and the fifth file will contain 1 row.
How can I solve this problem?
My method is to flatten the dict first, then split the flat dict into sublists of the length you want:
import csv

flatDict = [i for i in s.items()]
splitFlatDict = [flatDict[i:i + NO_OF_LINES_PER_FILE]
                 for i in xrange(0, len(flatDict), NO_OF_LINES_PER_FILE)]
for i, rows in enumerate(splitFlatDict):
    with open(str(i) + '.csv', 'wb') as f:
        writer = csv.writer(f)
        writer.writerows(rows)
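The snippet above is Python 2 (xrange, 'wb'); on Python 3 the same idea would be written with range and text mode plus newline='', roughly:

import csv

flat = list(s.items())
chunks = [flat[i:i + NO_OF_LINES_PER_FILE]
          for i in range(0, len(flat), NO_OF_LINES_PER_FILE)]
for i, rows in enumerate(chunks):
    with open('{}.csv'.format(i), 'w', newline='') as f:
        csv.writer(f).writerows(rows)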
I have multiple files, each containing 8 or 9 columns.
For a single file, I have to read the last column, which contains some value, count the number of occurrences of each value, and then generate an outfile.
I have done it like this:
inp = open(filename, 'r').read().strip().split('\n')
out = open(filename, 'w')   # safe only because the file was fully read above

from collections import Counter

C = Counter()
for line in inp:
    k = line.split()[-1]   # read the last column
    C[k] += 1

for value, count in C.items():
    x = "%s %d" % (value, count)
    out.write(x)
    out.write('\n')
out.close()
Now the problem: this works fine for generating one output from one input, but I need to scan a directory with the glob.iglob function and use every file as input, run the program above on each file, and then write all of the per-file results into a single OUTPUT file.
NOTE: while generating the single OUTPUT file, if any value turns up repeatedly, then instead of writing the same entry twice it is preferred to sum up the counts. E.g. analysis of the 1st file generates:
123 6
111 5
0 6
45 5
and the 2nd file generates:
121 9
111 7
0 1
22 2
In this case the OUTPUT file must be written in such a way that it contains:
123 6
111 12 # counts summed, since this value appears in both files
0 7
45 5
22 2
I have written the program for single-file analysis, but I'm stuck on the mass-analysis section.
Please help.
from collections import Counter
import glob

# NOTE: `filename` was undefined at this point in the original;
# the combined output needs its own (arbitrary) name.
out = open('OUTPUT.txt', 'w')

C = Counter()
g_iter = glob.iglob('path_to_dir/*')
for filename in g_iter:
    f = open(filename, 'r')
    inp = f.read().strip().split('\n')
    f.close()
    for line in inp:
        k = line.split()[-1]   # read the last column
        C[k] += 1

for value, count in C.items():
    x = "%s %d" % (value, count)
    out.write(x)
    out.write('\n')
out.close()
After de-uglification:
from collections import Counter
import glob

def main():
    # create Counter
    cnt = Counter()
    # collect data
    for fname in glob.iglob('path_to_dir/*.dat'):
        with open(fname) as inf:
            cnt.update(line.split()[-1] for line in inf)
    # dump results (use cnt.iteritems() on Python 2)
    with open("summary.dat", "w") as outf:
        outf.writelines("{:5s} {:>5d}\n".format(val, num) for val, num in cnt.items())

if __name__ == "__main__":
    main()
Initialise an empty dictionary at the top of the program,
let's say dic = dict(),
and for each Counter, update dic so that the values of matching keys are summed and new keys are added.
To update dic, use this:
dic = dict((n, dic.get(n, 0) + C.get(n, 0)) for n in set(dic) | set(C))
where C is the current Counter. After all files are finished, write dic to the output file.
import glob
from collections import Counter

dic = dict()
g_iter = glob.iglob(r'c:\\python32\fol\*')
for x in g_iter:
    lis = []
    with open(x) as f:
        inp = f.readlines()
    for line in inp:
        num = line.split()[-1]
        lis.append(num)
    C = Counter(lis)
    dic = dict((n, dic.get(n, 0) + C.get(n, 0)) for n in set(dic) | set(C))

for x in dic:
    print(x, '\t', dic[x])
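For what it's worth, collections.Counter can do this merge on its own: Counter.update() adds counts instead of replacing them, so the dict-rebuilding line is not strictly necessary. A sketch under the same folder assumption:

import glob
from collections import Counter

totals = Counter()
for fname in glob.iglob(r'c:\\python32\fol\*'):
    with open(fname) as f:
        # update() sums counts across files rather than overwriting them
        totals.update(line.split()[-1] for line in f if line.strip())

for value, count in totals.items():
    print(value, '\t', count)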
I did it like this:
import glob
from collections import Counter

out = open("write.txt", 'a')
C = Counter()
for file in glob.iglob('temp*.txt'):
    for line in open(file, 'r').read().strip().split('\n'):
        k = line.split()[-1]   # read the last column
        C[k] += 1

for value, count in C.items():
    x = "%s %d" % (value, count)
    out.write(x)
    out.write('\n')
out.close()