Python program to read multiple files at a time

I have 1000 files in a folder named md_1.mdp, md_2.mdp, ..., md_1000.mdp, and the 186th line of every file reads:
gen_seed = 35086
This value is different in every file and it is what I want to extract and print as the output.
I have written the following code but it is not displaying any output.
import numpy as np
idx = np.arange(1, 1000)
for i in idx:
    f = open('/home/abc/xyz/mdp_200/md_' + str(i) + '.mdp', 'r')
    l = f.readlines()
    l = l[185].split(" ")
    flag = 0
    for k in l:
        if flag == 1:
            if k != '':
                print(k)
                flag = 0
        if k == "t=":
            flag = 1
    f.close()
What should I add to this program so that it prints the required value for each file one by one in the order of md_1.mdp, md_2.mdp and so on?

You can use:
for i in range(1, 1001):
    with open('/home/abc/xyz/mdp_200/md_' + str(i) + '.mdp') as fp:
        l = fp.readlines()
    print(l[185].split('=')[-1].strip())
Or you can use linecache.getline:
import linecache

for i in range(1, 1001):
    file = f'/home/abc/xyz/mdp_200/md_{i}.mdp'
    line = linecache.getline(file, 186)  # linecache line numbers are 1-based
    print(line.split('=')[-1].strip())
After you get the line, the split is done on the '=' character.
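If you also want to keep the extracted values (for example, to write them to a single file in order), a minimal sketch along these lines should work; the missing-file check and the output file name gen_seeds.txt are assumptions, not part of the question:
import os

seeds = []
for i in range(1, 1001):
    path = f'/home/abc/xyz/mdp_200/md_{i}.mdp'
    if not os.path.exists(path):      # assumption: silently skip any missing file
        continue
    with open(path) as fp:
        lines = fp.readlines()
    # line 186 is index 185; keep whatever follows the '=' sign
    seeds.append(lines[185].split('=')[-1].strip())

# one value per line, in md_1 ... md_1000 order ('gen_seeds.txt' is a hypothetical name)
with open('gen_seeds.txt', 'w') as out:
    out.write('\n'.join(seeds) + '\n')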

Python - Nested Loops

I am having a problem with a loop in Python to get the desired result. Here is my issue.
First, I have one text file: urls.txt. This file has multiple URLs.
Second, I have multiple json files. Let's say there are 5 json files.
I want to process the first n lines of urls.txt with 1.json, then the next n lines of urls.txt with 2.json, and so on. After all 5 json files are used, I want to start from 1.json again and repeat the process until all the lines in urls.txt are processed.
In my case I want to rotate the json files after every 100 lines of urls.txt.
I have written some code to do that but unfortunately, I am not able to figure out how to repeat the operation once all the json files are used.
import glob
import time
from itertools import count

batch_size = 100
JSON_KEY_FILE_PATH = "json_files/"
JSON_FILENAME = '*.json'
json_file_list = glob.glob(JSON_KEY_FILE_PATH + JSON_FILENAME, recursive=True)
itr_length = len(json_file_list)

def UrlCall(URL_FILE):
    with open(URL_FILE, mode='r') as urllist:
        for j in range(0, itr_length):
            for i in count():
                line = urllist.readlines(20)
                print('===>' + str(i) + '===>' + str(line))
                if (i/batch_size).is_integer() and line != '' and i != 0 and j != itr_length:
                    # define the json message
                    print("Value of J" + str(j))
                    print("JSON FILE IN USE:" + str(json_file_list[j]))
                    if j == itr_length-1:
                        print("====>RESTARTING JSON EXECUTION")
                        time.sleep(10)
                        print("Value of J" + str(j))
                        print('===>' + str(i) + '===>' + str(line))
                        print("JSON FILE IN USE:" + str(json_file_list[j]))
                        return
                    break
This code exits after all the json files are used. But I want to start going through the json files again and process the next n lines of the urls.txt file.
You can use itertools.cycle(json_file_list) to loop through the list repeatedly.
You can use one of the techniques in What is the most "pythonic" way to iterate over a list in chunks? to iterate over the file in groups of N lines.
Then you can zip them to process them together.
import json
from itertools import cycle, zip_longest

def grouper(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

def url_call(url_file, json_file_list):
    with open(url_file) as f:
        for json_file, lines in zip(cycle(json_file_list), grouper(f, batch_size)):
            json_data = json.load(open(json_file))
            for line in lines:
                if line:
                    # do something with line and json_data
                    pass
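A usage sketch, assuming the glob pattern and batch size from the question (the file names are illustrative):
import glob

batch_size = 100                                # referenced inside url_call
json_file_list = glob.glob("json_files/*.json")
url_call("urls.txt", json_file_list)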

How to read data corresponding to specific line numbers from a 60GB text file in Python?

I have a text file (1 billion lines) of 60GB size. I have to extract data corresponding to specified line numbers, which are read from another text file (e.g. 1, 4, 70, 100, etc.). Due to the size I can't load the data into memory and then extract the lines. Also, line-by-line matching and extraction would take many days. Is there any solution to this problem?
Two methods which I tried:
1. First method:
f = open('line_numbers.txt')
lines = f.readlines()
numbers = [int(e.strip()) for e in lines]
r = max(numbers)
file = open('OUTPUT_RESULT.txt', 'w')
with open('Large_File.txt') as infile:
    for num, line in enumerate(infile, 1):
        if (num <= r):
            if (num in numbers):
                file.write(line)
            else:
                pass
            print(num)
It will take many days to get the result
2. Second method:
import pandas as pd
data = pd.read_csv('Large_File.txt', header=None)
file = open('OUTPUT_RESULT.txt', 'w')
f = open('line_numbers.txt')
lines = f.readlines()
numbers = [int(e.strip()) for e in lines]
x = data.loc[numbers, :]
file.write(x)
It fails to load the file into memory.
Is there any solution available to resolve this?
Your issue is probably with the if (num in numbers) line. Not only does it not need the parentheses, but it also checks this for every iteration, even though your code goes through the file in order (first line 1, then line 2, etc.).
That can be easily optimised; doing so, the code below ran in only 12 seconds on a test file of about 50 million lines. It should process your file in a few minutes.
import random
numbers = sorted([random.randint(1, 50000000) for _ in range(1000)])

outfile = open('specific_lines.txt', 'w')
with open('archive_list.txt', 'r', encoding='cp437') as infile:
    for num, line in enumerate(infile, 1):
        if numbers:
            if num == numbers[0]:
                outfile.write(line)
                print(num)
                del numbers[0]
        else:
            pass
Note: this generates 1,000 random line numbers; replace them with your loaded numbers as in your example. If your list of numbers is far larger, the write time for the output file will increase execution time somewhat.
Your code would be like:
with open('line_numbers.txt') as f:
    lines = f.readlines()
numbers = sorted([int(e.strip()) for e in lines])
outfile = open('specific_lines.txt', 'w')
with open('archive_list.txt', 'r', encoding='cp437') as infile:
    for num, line in enumerate(infile, 1):
        if numbers:
            if num == numbers[0]:
                outfile.write(line)
                print(num)
                del numbers[0]
        else:
            pass
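A minor variation, in case you prefer not to mutate a sorted list: keep the wanted line numbers in a set, which also gives an O(1) membership test, and stop once the set is empty. A sketch using the same file names as above:
with open('line_numbers.txt') as f:
    wanted = {int(line) for line in f if line.strip()}

with open('archive_list.txt', 'r', encoding='cp437') as infile, \
        open('specific_lines.txt', 'w') as outfile:
    for num, line in enumerate(infile, 1):
        if num in wanted:
            outfile.write(line)
            wanted.discard(num)
            if not wanted:        # stop once every requested line has been copied
                break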

Dynamically splitting a file into multiple smaller ones

I'm trying to split up a very large text file into multiple smaller ones. When I run the code below, the first created file is correct. Everything after that just contains the 'INSERT INTO ...' string and nothing else. Thanks in advance
import math

interval = 100000
with open('my-big-file', 'r') as c:
    for i, l in enumerate(c):
        pass
length = i + 1
numOfFiles = int(math.ceil(length / interval))
with open('my-big-file', 'r') as c:
    for j in range(0, numOfFiles):
        with open('my-smaller-file_{}.sql'.format(j), 'w') as n:
            print >> n, 'INSERT INTO codes (code, some-field, some-other-field) VALUES'
            for i, line in enumerate(c):
                if i >= j * interval and i < (j + 1) * interval:
                    line = line.rstrip()
                    if not line: continue
                    print >> n, "(%s, 'something', 'something else')," % (line)
                else:
                    break
You don't need to count the number of lines before iterating over the file; you can switch directly to a new output file whenever you reach the given number of lines:
#!/usr/bin/env python
def split(fn, num=1000, suffix="_%03d"):
    import os
    full, ext = os.path.splitext(fn)
    with open(fn, 'r') as f:
        for i, l in enumerate(f):
            if i % num == 0:
                try:
                    out.close()
                except UnboundLocalError:
                    pass
                out = open(full + suffix % (i/num) + ext, 'w')
            out.write(l)
        else:
            out.close()

if __name__ == '__main__':
    import sys
    split(sys.argv[1])
You can run this from the command line. Though probably the split command is more useful, since it supports a multitude of options.
It's also possible to rewrite this code to also use with for the file(s) being written to, but that's another topic.
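For what it's worth, here is a sketch of such a rewrite, keeping the naming scheme from the answer above but using with for the output files as well and itertools.islice for the chunking; it is an illustration, not the answer's code:
import os
from itertools import islice

def split(fn, num=1000, suffix="_%03d"):
    full, ext = os.path.splitext(fn)
    with open(fn) as f:
        part = 0
        while True:
            chunk = list(islice(f, num))          # next `num` lines, or fewer at EOF
            if not chunk:
                break
            with open(full + suffix % part + ext, 'w') as out:
                out.writelines(chunk)
            part += 1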

"list index out of range" when try to output lines from a text file using python

I was trying to extract the even lines from a text file and output them to a new file. But with my code Python warns me "list index out of range". Can anyone help me? Thanks!
Code:
f = open('input.txt', 'r')
i = 0
j = 0
num_lines = sum(1 for line in f)
newline = [0] * num_lines
print (num_lines)
for i in range(1, num_lines):
    if i % 2 == 0:
        newline[i] = f.readlines()[i]
        print i, newline[i]
    i = i + 1
f.close()

f = open('output.txt', 'w')
for j in range(0, num_lines):
    if j % 2 == 0:
        f.write(newline[j] + '\n')
    j = j + 1
f.close()
Output:
17
Traceback (most recent call last):
File "./5", line 10, in <module>
a = f.readlines()[1]
IndexError: list index out of range
After
num_lines = sum(1 for line in f)
the file pointer in f is at the end of the file. Therefore any subsequent call to f.readlines() gives an empty list. The minimal fix is to use f.seek(0) to return to the start of the file.
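A minimal sketch of that fix, which also calls readlines() only once and indexes into the result instead of calling it inside the loop:
f = open('input.txt', 'r')
num_lines = sum(1 for line in f)
f.seek(0)                      # rewind to the start of the file
all_lines = f.readlines()      # read the file once; use all_lines[i] inside the loop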
However, a better solution would be to read through the file only once, e.g. using enumerate to get the line and its index i:
newline = []
for i, line in enumerate(f):
    if i % 2 == 0:
        newline.append(line)
In your original script you read the file once to count the lines, then you (try to) read all the lines into memory, you needlessly create a list of the full size instead of just extending it with list.append, you initialize the list with zeroes even though it will hold strings, etc.
Thus, this script does what your original idea was, but better and simpler and faster:
with open('input.txt', 'r') as inf, open('output.txt', 'w') as outf:
    for lineno, line in enumerate(inf, 1):
        if lineno % 2 == 0:
            outf.write(line)
Specifically:
- open the files with the with statement so that they are automatically closed when the block is exited
- write the lines as they are read
- as lines are numbered 1-based, use enumerate with a start value of 1 so that you truly get the even-numbered lines
You've also got the itertools.islice approach available:
from itertools import islice

with open('input') as fin, open('output', 'w') as fout:
    # start at the second line (index 1), then take every other line
    fout.writelines(islice(fin, 1, None, 2))
This avoids the modulus operation and pushes the line writing down to the library level.
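For reference, islice(fin, 1, None, 2) starts at the second item and then takes every other one, which matches the 1-based even-line numbering; a quick illustration on a plain list:
from itertools import islice

lines = ['line 1\n', 'line 2\n', 'line 3\n', 'line 4\n', 'line 5\n']
print(list(islice(lines, 1, None, 2)))   # ['line 2\n', 'line 4\n']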

Generating a single outfile after analyzing multiple files in Python

I have multiple files, each containing 8 or 9 columns.
For a single file: I have to read the last column, which contains some value, count the number of occurrences of each value, and then generate an outfile.
I have done it like this:
inp = open(filename, 'r').read().strip().split('\n')
out = open(filename, 'w')
from collections import Counter
C = Counter()
for line in inp:
    k = line.split()[-1]  # so as to read the last column
    C[k] += 1
for value, count in C.items():
    x = "%s %d" % (value, count)
    out.write(x)
    out.write('\n')
out.close()
Now the problem is that it works fine if I have to generate one output for one input. But I need to scan a directory using the glob.iglob function for all the files to be used as input, run the above program on each file, and then write all of the analyzed results into a single OUTPUT file.
NOTE: While generating the single OUTPUT file, if any value is repeated, then instead of writing the same entry twice it is preferred to sum up the counts. E.g. analysis of the 1st file generates:
123 6
111 5
0 6
45 5
and the 2nd file generates:
121 9
111 7
0 1
22 2
In this case the OUTPUT file must be written in such a way that it contains:
123 6
111 12 #sum up count no. in case of similar value entry
0 7
45 5
22 2
I have written the program for single-file analysis, but I'm stuck on the mass-analysis part.
Please help.
from collections import Counter
import glob

out = open('OUTPUT.txt', 'w')  # single combined output file
g_iter = glob.iglob('path_to_dir/*')
C = Counter()
for filename in g_iter:
    f = open(filename, 'r')
    inp = f.read().strip().split('\n')
    f.close()
    for line in inp:
        k = line.split()[-1]  # so as to read the last column
        C[k] += 1
for value, count in C.items():
    x = "%s %d" % (value, count)
    out.write(x)
    out.write('\n')
out.close()
After de-uglification:
from collections import Counter
import glob

def main():
    # create Counter
    cnt = Counter()
    # collect data
    for fname in glob.iglob('path_to_dir/*.dat'):
        with open(fname) as inf:
            cnt.update(line.split()[-1] for line in inf)
    # dump results
    with open("summary.dat", "w") as outf:
        outf.writelines("{:5s} {:>5d}\n".format(val, num) for val, num in cnt.iteritems())

if __name__ == "__main__":
    main()
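Note that Counter.iteritems() only exists on Python 2; on Python 3 the dump line would use items() instead:
outf.writelines("{:5s} {:>5d}\n".format(val, num) for val, num in cnt.items())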
Initialise an empty dictionary at the top of the program,
let's say dic = dict(),
and for each Counter update dic so that the values of similar keys are summed and new keys are added to dic.
To update dic use this:
dic = dict((n, dic.get(n, 0) + C.get(n, 0)) for n in set(dic) | set(C))
where C is the current Counter; after all files are finished, write dic to the output file.
import glob
from collections import Counter

dic = dict()
g_iter = glob.iglob(r'c:\\python32\fol\*')
for x in g_iter:
    lis = []
    with open(x) as f:
        inp = f.readlines()
    for line in inp:
        num = line.split()[-1]
        lis.append(num)
    C = Counter(lis)
    dic = dict((n, dic.get(n, 0) + C.get(n, 0)) for n in set(dic) | set(C))
for x in dic:
    print(x, '\t', dic[x])
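As a side note, a Counter can accumulate across files directly via update(), which makes the dict comprehension unnecessary; a sketch of the same loop (same path pattern as above):
import glob
from collections import Counter

total = Counter()
for path in glob.iglob(r'c:\\python32\fol\*'):
    with open(path) as f:
        # count the value in the last column of every non-empty line
        total.update(line.split()[-1] for line in f if line.strip())

for value, count in total.items():
    print(value, '\t', count)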
I did it like this:
import glob
from collections import Counter

out = open("write.txt", 'a')
C = Counter()
for file in glob.iglob('temp*.txt'):
    for line in open(file, 'r').read().strip().split('\n'):
        k = line.split()[-1]  # so as to read the last column
        C[k] += 1
for value, count in C.items():
    x = "%s %d" % (value, count)
    out.write(x)
    out.write('\n')
out.close()
