How to run code on multiple fastq files? - python

I would like to run the following code on multiple fastq files in a folder. The folder contains different fastq files; first I have to read one file, perform the required operations, and store the results in a separate .fastq file, then read the second file, perform the same operations, and save the results in a new second .fastq file. Repeat the same procedure for all the files in the folder.
How can I do this? Can someone suggest a way to do this?
from Bio.SeqIO.QualityIO import FastqGeneralIterator

fout = open("prova_FiltraN_CE_filt.fastq", "w")
fin = open("prova_FiltraN_CE.fastq", "rU")
maxN = 0
countall = 0
countincl = 0
with open("prova_FiltraN_CE.fastq", "rU") as handle:
    for (title, sequence, quality) in FastqGeneralIterator(handle):
        countN = sequence.count("N", 0, len(sequence))
        countall += 1
        if countN == maxN:
            fout.write("@%s\n%s\n+\n%s\n" % (title, sequence, quality))
            countincl += 1
fin.close
fout.close
print countall, countincl

I think the following will do what you want. What I did was make your code into a function (and modified it to be what I think is more correct) and then called that function for every .fastq file found in the designated folder. The output file names are generated from the input files found.
from Bio.SeqIO.QualityIO import FastqGeneralIterator
import glob
import os

def process(in_filepath, out_filepath):
    maxN = 0
    countall = 0
    countincl = 0
    with open(in_filepath, "rU") as fin:
        with open(out_filepath, "w") as fout:
            for (title, sequence, quality) in FastqGeneralIterator(fin):
                countN = sequence.count("N", 0, len(sequence))
                countall += 1
                if countN == maxN:
                    fout.write("@%s\n%s\n+\n%s\n" % (title, sequence, quality))
                    countincl += 1
    print os.path.split(in_filepath)[1], countall, countincl

folder = "/path/to/folder"  # folder to process

for in_filepath in glob.glob(os.path.join(folder, "*.fastq")):
    root, ext = os.path.splitext(in_filepath)
    if not root.endswith("_filt"):  # avoid processing existing output files
        out_filepath = root + "_filt" + ext
        process(in_filepath, out_filepath)
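For example, a folder containing sample1.fastq and sample2.fastq (hypothetical names) would gain sample1_filt.fastq and sample2_filt.fastq alongside them, with one "name countall countincl" line printed per input file.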

Related

Extracting a differentiating numerical value from multiple files - PowerShell/Python

I have multiple text files containing different text.
They all contain a single appearance of the same 2 lines I am interested in:
================================================================
Result: XX/100
I am trying to write a script to collect all those XX values (numerical values between 0 and 100), and paste them in a CSV file with the text file name in column A and the numerical value in column B.
I have considered using Python or PowerShell for this purpose.
How can I identify the line where "Result" appears beneath the "===..." string, collect its content up to '\n', and then strip "Result: " and "/100" from it?
"Result" and other numerical values can appear elsewhere in the files, but never in the quoted format below a "=====" line, like the line I'm interested in.
Thank you!
Edit: I have written this poor naive attempt to collect the numerical values.
import os

dir_path = os.path.dirname(os.path.realpath(__file__))
for filename in os.listdir(dir_path):
    if filename.endswith(".txt"):
        with open(filename, "r") as f:
            lineFound = False
            for index, line in enumerate(f):
                if lineFound:
                    line = line.replace("Result: ", "")
                    line = line.replace("/100", "")
                    line = line.strip()
                    grade = line
                    lineFound = False
                    print(grade, end='')
                    continue
                if index > 3:
                    if "================================================================" in line:
                        lineFound = True
I'd still be happy to learn if there's a simple way to do this with PowerShell tbh
For the output, I used csv writer to append the results to a file one by one.
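For reference, a minimal sketch of that output step, assuming the results have been collected as (filename, grade) pairs; all names here are hypothetical:
import csv

rows = [("fileA.txt", "87"), ("fileB.txt", "42")]  # hypothetical collected results

with open("results.csv", "a", newline="") as f:
    writer = csv.writer(f)
    for filename, grade in rows:
        writer.writerow([filename, grade])  # one result appended per call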
So there are a few steps involved here; the first is to get a list of files. There's a ton of answers for that one on Stack Overflow, but this one is stupidly complete (see the link in the docstring below).
Once you have the list of files, you can simply load the files one by one and do some simple string.split() calls to get the value you want.
Finally, write the results into a CSV file. Since the CSV file is a simple one, you don't need to use the csv library for this.
See the code example below. Note that I copied/pasted the function for generating the list of files from my personal github repo. I reuse that one a lot.
import os

def get_files_from_path(path: str = ".", ext: str or list = None) -> list:
    """Find files in path and return them as a list.

    Gets all files in folders and subfolders.

    See the answer on the link below for a ridiculously
    complete answer for this:
    https://stackoverflow.com/a/41447012/9267296

    Args:
        path (str, optional): Which path to start on.
            Defaults to '.'.
        ext (str/list, optional): Optional file extension.
            Defaults to None.

    Returns:
        list: list of file paths
    """
    result = []
    for subdir, dirs, files in os.walk(path):
        for fname in files:
            filepath = f"{subdir}{os.sep}{fname}"
            if ext is None:
                result.append(filepath)
            elif type(ext) == str and fname.lower().endswith(ext.lower()):
                result.append(filepath)
            elif type(ext) == list:
                for item in ext:
                    if fname.lower().endswith(item.lower()):
                        result.append(filepath)
    return result
filelist = get_files_from_path("path/to/files/", ext=".txt")

split1 = "================================================================\nResult: "
split2 = "/100"

with open("output.csv", "w") as outfile:
    outfile.write('filename, value\n')
    for filename in filelist:
        with open(filename) as infile:
            value = infile.read().split(split1)[1].split(split2)[0]
        print(value)
        outfile.write(f'"{filename}", {value}\n')
You could try this.
In this example the filename written to the CSV will be its full (absolute) path. You may just want the base filename.
Uses the same, albeit seemingly unnecessary, mechanism for deriving the source directory. It would be unusual to have your Python script in the same directory as your data.
import os
import glob

equals = '=' * 64
dir_path = os.path.dirname(os.path.realpath(__file__))
outfile = os.path.join(dir_path, 'foo.csv')

with open(outfile, 'w') as csv:
    print('A,B', file=csv)
    for file in glob.glob(os.path.join(dir_path, '*.txt')):
        prev = None
        with open(file) as indata:
            for line in indata:
                t = line.split()
                if len(t) == 2 and t[0] == 'Result:' and prev is not None and prev.startswith(equals):
                    v = t[1].split('/')
                    if len(v) == 2 and v[1] == '100':
                        print(f'{file},{v[0]}', file=csv)
                    break
                prev = line
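As noted above, you may want just the base filename in column A; swapping file for os.path.basename(file) in the final print() call would write, e.g., data1.txt (a hypothetical name) instead of its full path.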

Python script to merge more than 200 very large CSV files into just one

I have been trying to merge several .csv files from different subfolders (all with the same name) into one. I tried with R but I ran out of memory during the process (it needs to merge more than 20 million rows). I am now working with a Python script to try to do it (see below). The files also have many columns and I don't need all of them, but I don't know if I can choose which columns to add to the new csv:
import glob
import csv
import os

path = 'C:\\path\\to\\folder\\where\\all\\files\\are-allowated-in-subfolders'
result = glob.glob('*/certificates.csv')
#for i in result:
#    full_path = "C:\\path\\to\\folder\\where\\all\\files\\are-allowated-in-subfolders\\" + result
#    print(full_path)
os.chdir(path)
i = 0
for root, directories, files in os.walk(path, topdown=False):
    for name in files:
        print(name)
        try:
            i += 1
            if i % 10000 == 0:
                # just to see the progress
                print(i)
            if name == 'certificates.csv':
                creader = csv.reader(open(name))
                cwriter = csv.writer(open('processed_' + name, 'w'))
                for cline in creader:
                    new_line = [val for col, val in enumerate(cline)]
                    cwriter.writerow(new_line)
        except:
            print('problem with file: ' + name)
            pass
but it doesn't work, and it doesn't return any error either, so at the moment I am completely stuck.
Your indentation is wrong, and you are overwriting the output file for each new input file. Also, you are not using the glob result for anything. If the files you want to read are all immediately in subdirectories of path, you can do away with the os.walk() call and do the glob after you os.chdir().
import glob
import csv
import os

# No real need to have a variable for this really
path = 'C:\\path\\to\\folder\\where\\all\\files\\are-allowated-in-subfolders'
os.chdir(path)

# Obviously, can't use input file name in output file name
# because there is only one output file for many input files
with open('processed.csv', 'w') as dest:
    cwriter = csv.writer(dest)
    for i, name in enumerate(glob.glob('*/certificates.csv'), 1):
        if i % 10000 == 0:
            # just to see the progress
            print(i)
        try:
            with open(name) as csvin:
                creader = csv.reader(csvin)
                for cline in creader:
                    # no need to enumerate fields
                    cwriter.writerow(cline)
        except:
            print('problem with file: ' + name)
You probably just need to keep a merged.csv file open whilst reading in each of the certificates.csv files. glob.glob() can be used to recursively find all suitable files:
import glob
import csv
import os

path = r'C:\path\to\folder\where\all\files\are-allowated-in-subfolders'
os.chdir(path)

with open('merged.csv', 'w', newline='') as f_merged:
    csv_merged = csv.writer(f_merged)
    # note: recursive=True only recurses with a '**' pattern
    for filename in glob.glob(os.path.join(path, '**/certificates.csv'), recursive=True):
        print(filename)
        try:
            with open(filename) as f_csv:
                csv_merged.writerows(csv.reader(f_csv))
        except:
            print('problem with file: ', filename)
An r prefix can be added to your path to avoid needing to escape each backslash. Also newline='' should be added to the open() when using a csv.writer() to stop extra blank lines being written to your output file.
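Since you mention not needing every column, here is a minimal sketch of keeping only selected columns while merging; the indices in wanted are a hypothetical choice:
import csv
import glob
import os

path = r'C:\path\to\folder\where\all\files\are-allowated-in-subfolders'
wanted = [0, 2]  # hypothetical: keep the 1st and 3rd columns

with open(os.path.join(path, 'merged_subset.csv'), 'w', newline='') as f_merged:
    writer = csv.writer(f_merged)
    for filename in glob.glob(os.path.join(path, '**/certificates.csv'), recursive=True):
        with open(filename, newline='') as f_csv:
            for row in csv.reader(f_csv):
                writer.writerow([row[i] for i in wanted])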

Loop through sub-folders with Python on Mac to check if file exists, open, copy, paste then close file

I have a folder, with numerous (~200) sub-folders on my Mac desktop.
Some (but not all) sub-folders contain a csv file named "sample.csv".
Further, I have a "aggregate.csv" file where I would like to copy the 2nd column of each "sample.csv" data into.
Structure:
"/Desktop/folder"
"/Desktop/folder/aggregate.csv"
"/Desktop/folder/sub-folder"
"/Desktop/folder/sub-folder/sample.csv"
Using Python, how can I loop through each sub-folder, check if "sample.csv" exists, open it, copy the 2nd column, paste this column into the "aggregate.csv" file, close "sample.csv", then move on to the next sub-folder?
In "aggregate.csv", the copied data should increment to the right, so it doesn't overwrite the previous "sample.csv" data that has just been pasted.
My computer opens the CSV files with Excel; that's why I refer to the "2nd column".
Many thanks
$ cd ~
$ more aggregate.csv
X
X
X
X
X
X
$ more ./Desktop/folder/sub-folder/sample.csv
A,1
A,2
A,3
A,4
A,5
$ more ./Desktop/folder/sub-folder/sub-sub-folder/sample.csv
B,6
B,7
B,8
B,9
$ more ./Desktop/folder/sub-folder2/sample.csv
C,10
C,11
C,12
C,13
C,14
C,15
C,16
$ more ./Desktop/folder/sub-folder3/sub-sub-folder/sample.csv
D,17
D,18
D,19
$ python3 aggregate_samples.py ./Desktop
./Desktop/folder/sub-folder/sample.csv
./Desktop/folder/sub-folder/sub-sub-folder/sample.csv
./Desktop/folder/sub-folder2/sample.csv
./Desktop/folder/sub-folder3/sub-sub-folder/sample.csv
$ cat aggregate.csv
X,1,6,10,17
X,2,7,11,18
X,3,8,12,19
X,4,9,13,
X,5,,14,
X,,,15,
,,,16,
Here is the code that accomplishes this. The key technologies you need: os.walk() to recursively search the folders, the csv module to read in the sample.csv files (and get the 2nd column), lists to accumulate the samples, and csv again to write out the result. I assumed your sample.csv files will be different lengths, so the code handles that (by pre-allocating a sparse matrix).
This assumes your dataset is small enough to fit into memory. If not, then more work needs to be done; a streaming sketch follows the script below.
# aggregate_samples.py
import os
import sys
import argparse
import csv

def main(options):
    columns = []
    longest_sample = 0
    try:
        # Load in aggregate.csv, if there is one.
        with open('aggregate.csv') as f:
            column = [line.rstrip('\n') for line in f]
            columns.append(column)
            longest_sample = len(column)  # existing rows count too
    except FileNotFoundError:
        # Doesn't exist; create it later.
        pass

    for d, subdirs, files in os.walk(options.directory):
        subdirs.sort()
        for filename in files:
            if filename == 'sample.csv':
                file_path = os.path.join(d, filename)
                print(file_path)
                samples = []
                with open(file_path) as f:
                    reader = csv.reader(f, delimiter=',')
                    # Get the 2nd column.
                    for sample in reader:
                        samples.append(sample[1])
                longest_sample = max(longest_sample, len(samples))
                columns.append(samples)

    # Pre-fill a transpose matrix according to number of columns
    # and longest column.
    a = [ [ '' for i in columns ] for j in range(longest_sample) ]

    # Move samples into matrix, transposing as you go.
    for i in range(len(columns)):
        for j in range(len(columns[i])):
            a[j][i] = columns[i][j]

    # Output matrix as CSV.
    with open('aggregate.csv', 'w+') as aggregate:
        writer = csv.writer(aggregate, delimiter=',')
        writer.writerows(a)

    return 0

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'directory',
        help='Directory path.')
    options = parser.parse_args()
    sys.exit(main(options))
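If the dataset does not fit in memory, a minimal streaming sketch, assuming one open handle per sample.csv is acceptable (fine for ~200 files) and ignoring any existing aggregate.csv content for brevity:
import csv
import os
from itertools import zip_longest

root = 'Desktop/folder'  # assumption: same layout as in the question
paths = [os.path.join(d, 'sample.csv')
         for d, _, files in sorted(os.walk(root)) if 'sample.csv' in files]

# One lazy iterator over the 2nd column of each sample.csv.
handles = [open(p, newline='') for p in paths]
columns = [(row[1] for row in csv.reader(h)) for h in handles]

with open('aggregate.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    # zip_longest pads shorter columns with '' instead of overwriting.
    for out_row in zip_longest(*columns, fillvalue=''):
        writer.writerow(out_row)

for h in handles:
    h.close()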
You can get all the info you need about the script package in its documentation. The Program argument is set using Packages->Script->Configure Script.

Splitting CSV file by row

Someone, please review this code for me. I am kind of confused by the paths in the code. By the way, the code is for splitting a CSV file based on a number of rows; I got it on GitHub and have been trying to use it to split a CSV file, but the code is too confusing for me.
You may also follow the link to see the code.
Assuming that the name of the csv to be split is Dominant.csv,
source_filepath is C:\\Users\\James\\Desktop\\Work,
dest_path is C:\\Users\\James\\Desktop\\Work\\Processed,
result_filename_prefix is split,
my confusion is:
Does target_filename in the code mean my csv file Dominant.csv? And what exactly is target_filepath?
Could someone please reformat the code for me as per the given paths and file names? I would be really thankful.
import csv
import os
import sys

if len(sys.argv) != 5:
    raise Exception('Wrong number of arguments!')

SOURCE_FILEPATH = sys.argv[1]
DEST_PATH = sys.argv[2]
FILENAME_PREFIX = sys.argv[3]
ROW_LIMIT = int(sys.argv[4])

def split_csv(source_filepath, dest_path, result_filename_prefix, row_limit):
    """
    Split a source CSV into multiple CSVs of equal numbers of records,
    except the last file.

    The initial file's header row will be included as a header row in each
    split file.

    Split files follow a zero-index sequential naming convention like so:

        `{result_filename_prefix}_0.csv`

    :param source_filepath {str}:
        File name (including full path) for the file to be split.
    :param dest_path {str}:
        Full path to the directory where the split files should be saved.
    :param result_filename_prefix {str}:
        File name to be used for the generated files.
        Example: If `my_split_file` is provided as the prefix, then a
        resulting file might be named: `my_split_file_0.csv'
    :param row_limit {int}:
        Number of rows per file (header row is excluded from the row count).
    :return {NoneType}:
    """
    if row_limit <= 0:
        raise Exception('row_limit must be > 0')

    with open(source_filepath, 'r') as source:
        reader = csv.reader(source)
        headers = next(reader)

        file_number = 0
        records_exist = True

        while records_exist:
            i = 0
            target_filename = f'{result_filename_prefix}_{file_number}.csv'
            target_filepath = os.path.join(dest_path, target_filename)

            with open(target_filepath, 'w') as target:
                writer = csv.writer(target)

                while i < row_limit:
                    if i == 0:
                        writer.writerow(headers)
                    try:
                        writer.writerow(next(reader))
                        i += 1
                    except:
                        records_exist = False
                        break

            if i == 0:
                # we only wrote the header, so delete that file
                os.remove(target_filepath)

            file_number += 1

split_csv(SOURCE_FILEPATH, DEST_PATH, FILENAME_PREFIX, ROW_LIMIT)
target_filename is the name you want the output file to have.
target_filepath is the path to the output file including its name.
In the split_csv function call:
SOURCE_FILEPATH is the path to the source file, including its name
DEST_PATH is the path to the folder you want the output file in
FILENAME_PREFIX is what you want the output file name(s) to start with
ROW_LIMIT is the maximum number of rows per file you want written to the output file.
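With the paths from the question plugged in, a hedged example invocation (the script file name split_script.py and the row limit 1000 are assumptions; substitute your own):
$ python split_script.py "C:\Users\James\Desktop\Work\Dominant.csv" "C:\Users\James\Desktop\Work\Processed" split 1000
This would write split_0.csv, split_1.csv, ... into the Processed folder, each holding at most 1000 data rows plus a copy of the header.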

Searching multiple text files for two strings?

I have a folder with many text files (EPA10.txt, EPA55.txt, EPA120.txt..., EPA150.txt). I have 2 strings that are to be searched in each file and the result of the search is written in a text file result.txt. So far I have it working for a single file. Here is the working code:
if 'LZY_201_335_R10A01' and 'LZY_201_186_R5U01' in open('C:\\Temp\\lamip\\EPA150.txt').read():
    with open("C:\\Temp\\lamip\\result.txt", "w") as f:
        f.write('Current MW in node is EPA150')
else:
    with open("C:\\Temp\\lamip\\result.txt", "w") as f:
        f.write('NOT EPA150')
Now I want this to be repeated for all the text files in the folder. Please help.
Given that you have some number of files named from EPA1.txt to EPA150.txt, but you don't know all the names, you can put them all together inside a folder and then get a list of their filenames with the os.listdir() method, e.g. listdir("C:/Temp/lamip").
Also, your if statement is wrong; you should do this instead:
text = file.read()
if "string1" in text and "string2" in text:
Here's the code:
from os import listdir

with open("C:/Temp/lamip/result.txt", "w") as f:
    for filename in listdir("C:/Temp/lamip"):
        with open('C:/Temp/lamip/' + filename) as currentFile:
            text = currentFile.read()
            if ('LZY_201_335_R10A01' in text) and ('LZY_201_186_R5U01' in text):
                f.write('Current MW in node is ' + filename[:-4] + '\n')
            else:
                f.write('NOT ' + filename[:-4] + '\n')
PS: You can use / instead of \\ in your paths; Windows accepts forward slashes, so Python handles them for you.
Modularise! Modularise!
Well, not in terms of having to write distinct Python modules, but isolate the different tasks at hand.
1. Find the files you wish to search.
2. Read each file and locate the text.
3. Write the result into a separate file.
Each of these tasks can be solved independently. For example, to list the files you have os.listdir, which you might want to filter.
For step 2, it does not matter whether you have 1 or 1,000 files to search. The routine is the same. You merely have to iterate over each file found in step 1. This indicates that step 2 could be implemented as a function that takes the filename (and possibly the search strings) as arguments, and returns True or False.
Step 3 is the combination of each element from step 1 and the result of step 2.
The result:
import os

files = [fn for fn in os.listdir('C:/Temp/lamip') if fn.endswith('.txt')]
# perhaps filter `files`

def does_fn_contain_string(filename):
    with open('C:/Temp/lamip/' + filename) as blargh:
        content = blargh.read()
    return 'string1' in content and 'string2' in content  # or `or`, as needed

with open('results.txt', 'w') as output:
    for fn in files:
        if does_fn_contain_string(fn):
            output.write('Current MW in node is {0}\n'.format(fn[:-4]))
        else:
            output.write('NOT {0}\n'.format(fn[:-4]))
You can do this by creating a for loop that runs through all your .txt files in the current working directory.
import os

with open("result.txt", "w") as resultfile:
    for result in [txt for txt in os.listdir(os.getcwd()) if txt.endswith(".txt")]:
        text = open(result).read()
        if 'LZY_201_335_R10A01' in text and 'LZY_201_186_R5U01' in text:
            resultfile.write('Current MW in node is {0}'.format(result[:-4]))
        else:
            resultfile.write('NOT {0}'.format(result[:-4]))
