Python - Loop inside loop of two files with regex

Python - Loop inside loop of two files with regex - python

Im trying to create a loop in order to check two files and compare with regex if a specific field matches.
avi file
TVShowName.S01E01.W.DVDRip.XviD.avi
TVShowName.S01E02.W.DVDRip.XviD.avi
TVShowName.S01E03.W.DVDRip.XviD.avi
srt
tvShowName.S01E01.episodename.DVDRip.XviD.srt
tvShowName.S01E02.episodename.DVDRip.XviD.srt
tvShowName.S01E03.episodename.DVDRip.XviD.srt
Without a loop I can match the file and make the magic happen. Although when I use the loop it only reaches the first line.
TVShowName.S01E01.W.DVDRip.XviD.avi
TVShowName.S01E01.W.DVDRip.XviD.srt
Code:
f1 = open('avi', 'r')
f2 = open('srt', 'r')
f3 = open ('merge', 'a')
for avi in f1:
m = re.search(".*([Ss][0-20].[eE][0-24].)+.*", avi )
for sub in f2:
n = re.search(".*([Ss][0-20].[eE][0-24].)+.*", sub )
if m.group(1) == n.group(1):
str_avi = str(m.group(0))
#print str_avi
ext_srt = str_srt.split('.')
ext_avi = str_avi.split('.')
#print ext_avi
#conv_str = str(m.group(0))
merge = str_avi.replace(ext_avi[-1],ext_srt[-1])
print merge
f3.write(merge)
f3.close()

I'm not entirely sure if this is the output you wanted. I can't add comments because I don't have enough reputation points.
import glob
import re
avifiles = []
srtfiles = []
for afile in glob.glob('*.avi'):
avifiles.append(afile)
for sfile in glob.glob('*.srt'):
srtfiles.append(sfile)
#f1 = open('avi', 'r')
#f2 = open('srt', 'r')
f3 = open ('merge', 'a')
for avi in avifiles:
m = re.search(".*([Ss][0-20].[eE][0-24].)+.*", avi )
for sub in srtfiles:
n = re.search(".*([Ss][0-20].[eE][0-24].)+.*", sub )
if m.group(1) == n.group(1):
str_avi = str(m.group(0))
str_srt = str(n.group(0))
ext_srt = str_srt.split('.')
ext_avi = str_avi.split('.')
#print ext_avi
#conv_str = str(m.group(0))
merge = str_avi.replace(ext_avi[-1],ext_srt[-1])
print merge
f3.write(merge+"\n")
f3.close()

I made the follow code and it seems working. My next step is add more video extentions. But it should be easy.
Thank you guys fot the helping!
import re, os, sys, itertools
str_avi = ''
split_avi = ''
global zzz
lista_avi = []
lista_srt = []
lista_final = []
os.chdir('.')
f1 = os.listdir(".")
for full in f1:
avi = re.search(".*([Ss][0-9].[eE][0-9].)+.*(...$)", full )
if avi:
if avi.group(2) == 'avi':
lista_avi.append(avi.group(0))
elif avi.group(2) == 'srt':
lista_srt.append(avi.group(0))
else:
pass
else:
print "Nenhum Arquivo localizado!"
for f,b in itertools.izip(lista_avi,lista_srt):
data_avi = f.split('.')
data_srt = b.split('.')
data_regx_avi = re.search(".*([Ss][0-9].[eE][0-9].)+.*(...$)", f )
data_regx_srt = re.search(".*([Ss][0-9].[eE][0-9].)+.*(...$)", b )
for x in lista_srt:
data_regx_srt = re.search(".*([Ss][0-9].[eE][0-9].)+.*(...$)", x )
if data_regx_avi.group(1) == data_regx_srt.group(1):
print 'Arquivo video:', data_regx_avi.group(0)
print 'Arquivo sub: ', f.replace(data_avi[-1],data_srt[-1])
#lista_final.append(f.replace(data_avi[-1],data_srt[-1]))
xx = f.replace(data_avi[-1],data_srt[-1])
os.rename(x, xx)

Related

How to do motif search using python?

I am trying to check for the nrf2 binding motif using regular expression with python. I have done that with R using JASPAR2018 PWM, but due to few issues with JASPAR.
I wish to redo it using python.
Attempt
from Bio import SeqIO
from itertools import islice
import pandas as pd
#Creating Reverese Complements
def reverseComp(Seq):
seq = Seq.upper()
d = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
try:
seq = seq[::-1]
rc_seq = "".join([d[nuc] for nuc in seq])
except KeyError:
return "Not Viable DNA Seq"
return rc_seq
def genSeq(genome_path, chrom, chromstart, chromend):
if bool(re.search('gz', genome_path)) | bool(re.search('fa', genome_path)) | bool(re.search('fasta', genome_path)):
if bool(re.search('gz', genome_path)) == True:
genome = SeqIO.parse(gzip.open(genome_path, 'rt'),'fasta')
identifiers = [seq_record.id for seq_record in genome]
seq_gen = next(islice(genome, identifiers.index(chrom) , None))
seq = str(seq_gen.seq[chromstart:chromend])
else:
genome = SeqIO.parse(open(genome_path),'fasta')
identifiers = [seq_record.id for seq_record in genome]
seq_gen = next(islice(genome, identifiers.index(chrom)+1 , None))
seq = str(seq_gen.seq[chromstart:chromend])
elif bool(re.search('2bit', genome_path)):
tbGenome = tbr.TwoBitFile(genome_path)
seq = tbGenome[chrom][chromstart:chromend]
else:
raise Exception('File type not recognized')
return (seq).upper()
pat = "[AGC]TGA[CTG][ATCG][CAT][AGT]GC[ATCG]"
pattern = re.compile(pat)
motifDF = []
motifQuant = []
with open('/Users/kalyanidhusia/Desktop/nrf2_R/ENCFF126HBJ.bed') as f:
for line in f:
peak = list(line.split())
seq = genSeq('hg19.fa', peak[0], int(peak[1]), int(peak[2]))
rSeq = reverseComp(seq)
sequences = []
for result in re.finditer(pattern, seq):
sequences.append("".join(result.groups()))
for result in re.finditer(pattern, rSeq):
sequences.append("".join(result.groups()))
if len(sequences) > 0:
seqs = pd.DataFrame({'binding':sequences, 'chrom':peak[0], 'chromstart':peak[1], 'chromend':peak[2]})
motifDF.append(seqs)
motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])
search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
Error
This is the error I am getting:
ipython-input-3-2e7ebdf92205> in genSeq(genome_path, chrom,
chromstart, chromend) 25 identifiers = [seq_record.id for seq_record
in genome] ---> 26 seq_gen = next(islice(genome,
identifiers.index(chrom)+1 , None)) 27 seq =
str(seq_gen.seq[chromstart:chromend]) 28 elif bool(re.search('2bit',
genome_path)): StopIteration:
How do I solve this problem?

To the above problem, I was able to solve it by tweaking with my code a little. Here is the solved example for you guys and my problem with the code below:
motif = '[REGULAR_EXPRESSION_FOR_YOUR_MOTIF]'
regBS = re.compile(motif)
motifDF = []
motifQuant = []
genome = tbr.TwoBitFile('/Path_to_your_genomefile_in_2bit.2bit/')
with open('/Path_to_your.bedfile/') as f:
for line in f:
if line.startswith('track') == False:
peak = list(line.split())
seq = (genome[peak[0]][int(peak[1]):int(peak[2])]).upper()
rSeq = reverseComp(seq)
sequences = []
sequences.extend(re.findall(regBS, seq))
sequences.extend(re.findall(regBS, rSeq))
if len(sequences) > 0:
seqs = pd.DataFrame({'binding':sequences, 'chrom':peak[0],'chromstart':peak[1], 'chromend':peak[2], 'NR':'NRF2'})
motifDF.append(seqs)
motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])
search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
dist_reg.head()
n = 5
x = [len(i[6+n:-6-n]) for i in search_reg['binding']]
This code generates the peak sequences that I want and store it in search_reg[binding] but it also stores a space seperated number with it. I need to store them in two different columns. Any suggestions?

Reading repeated information from the file in different order in Python

I tried to search for similar questions, but I couldn't find. Please mark as a duplicate if there is similar questions available.
I'm trying to figure out a way to read and gather multiple information from single file. Here in the file Block-A,B & C are repeated in random order and Block-C has more than one information to capture. Every block end with 'END' text. Here is the input file:
Block-A:
(info1)
END
Block-B:
(info2)
END
Block-C:
(info3)
(info4)
END
Block-C:
(info7)
(info8)
END
Block-A:
(info5)
END
Block-B:
(info6)
END
Here is my code:
import re
out1 = out2 = out3 = ""
a = b = c = False
array=[]
with open('test.txt', 'r') as f:
for line in f:
if line.startswith('Block-A'):
line = next(f)
out1 = line
a = True
if line.startswith('Block-B'):
line=next(f)
out2 = line
b = True
if line.startswith('Block-C'):
c = True
if c:
line=next(f)
if not line.startswith('END\n'):
out3 = line
array.append(out3.strip())
if a == b == c == True:
print(out1.rstrip() +', ' + out2.rstrip() + ', ' + str(array))
a = b = c = False
array=[]
Thank you in advance for your valuable inputs.

Use a dictionary for the datas from each block. When you read the line that starts a block, set a variable to that name, and use it as the key into the dictionary.
out = {}
with open('test.txt', 'r') as f:
for line in f:
if line.endswidth(':'):
blockname = line[:-1]
if not blockname in out:
out[blockname] = ''
elif line == 'END'
blockname = None
else if blockname:
out[blockname] += line
print(out)

If you don't want the Block-X to print, unhash the elif statment
import os
data = r'/home/x/Desktop/test'
txt = open(data, 'r')
for line in txt.readlines():
line = line[:-1]
if line in ('END'):
pass
#elif line.startswith('Block'):
# pass
else:
print line
>>>>
Block-A:
(info1)
Block-B:
(info2)
Block-C:
(info3)
(info4)
Block-C:
(info7)
(info8)
Block-A:
(info5)
Block-B:
(info6)

Extracting data from a text file to an output file

I have alot of files which names are just number. (Starting from 1 to whatever is the maximum number) and each of these files are similar to each other by their "tags" (ObjectID =, X =, Y =, etc.), but the values after those tags are not the same at all.
I wanted to make my job easier from manually copy/pasting the data from one file to another and made a small script using Python (since I am slightly experienced in it).
This is the full script:
import os
BASE_DIRECTORY = 'C:\Users\Tom\Desktop\TheServer\scriptfiles\Objects'
output_file = open('output.txt', 'w')
output = {}
file_list = []
for (dirpath, dirnames, filenames) in os.walk(BASE_DIRECTORY):
for f in filenames:
if 'txt' in str(f):
e = os.path.join(str(dirpath), str(f))
file_list.append(e)
for f in file_list:
print f
txtfile = open(f, 'r')
output[f] = []
for line in txtfile:
if 'ObjectID =' in line:
output[f].append(line)
elif 'X =' in line:
output[f].append(line)
elif 'Y =' in line:
output[f].append(line)
tabs = []
for tab in output:
tabs.append(tab)
tabs.sort()
for tab in tabs:
for row in output[tab]:
output_file.write(row + '')
Now, everything is working fine, the output file looks like this:
ObjectID = 1216
X = -1480.500610
Y = 2610.885742
ObjectID = 970
X = -1517.210693
Y = 2522.842285
ObjectID = 3802
X = -1512.156616
Y = 2521.116210
etc.
But I don't want it to be like that (each value has a new line). I need it to do this for every file:
Read the file.
Remove the tags infront of the values.
Format a single line which will have those values in the output folder. (Let's say I want to make it look like this: "(1216,-1480.500610,2522.842285)" )
Write that line in the output folder.
Repeat for every file.
Any help please?

Hope this helps.
data = open('sam.txt', 'r').read()
>>> print data
ObjectID = 1216
X = -1480.500610
Y = 2610.885742
ObjectID = 970
X = -1517.210693
Y = 2522.842285
ObjectID = 3802
X = -1512.156616
Y = 2521.116210
>>>
Now lets do some string replacements :)
>>> data = data.replace('ObjectID =', '').replace('\nX = ', ',').replace('\nY = ', ',')
>>> print data
1216,-1480.500610,2610.885742
970,-1517.210693,2522.842285
3802,-1512.156616,2521.116210

In your loop, keep track of whether you are 'in' a record:
records = []
in_record = False
id, x, y = 0, 0, 0
for line in txtfile:
if not in_record:
if 'ObjectID =' in line:
in_record = True
id = line[10:]
elif 'X =' in line:
x = line[3:]
elif 'Y =' in line:
y = line[3:]
records.append((id, x, y))
in_record = False
Then you'll have a list of tuples which you can easily write with the csv module.

Find here a version of the loop you have generating the contents.
I rewrote it so the line contents ObjectId, X and Y are in the same line.
It looks that is what you want to do:
for f in file_list:
print f
txtfile = open(f, 'r')
output[f] = []
for line in txtfile:
myline = ''
if 'ObjectID =' in line:
pos = line.rfind("ObjectID =") + len("ObjectID =")
rest = line[pos:]
# Here you set the delimiter after the ObjectID value. Can be ","
numbers = rest.split(" ")
if len(numbers) > 0:
myline.append(numbers[0])
elif 'X =' in line:
pos = line.rfind("X =") + len("X =")
rest = line[pos:]
# Here you set the delimiter after the ObjectID value. Can be ","
numbers = rest.split(" ")
if len(numbers) > 0:
myline.append(numbers[0])
elif 'Y =' in line:
pos = line.rfind("Y =") + len("Y =")
rest = line[pos:]
# Here you set the delimiter after the ObjectID value. Can be ","
numbers = rest.split(" ")
if len(numbers) > 0:
myline.append(numbers[0])
output[f].append(myline)
Note that you need to know which character (in the code the delimiter) separates the names you try to find: ObjectID = from the actual values you want to grab from the line.

Here is what you need. I did not have enough time to write the code for appending the result to a new file. Instead it just prints it, but you get the point.
import os.path
path = "path"
#getting the number of files in your folder
num_files = len([f for f in os.listdir(path)
if os.path.isfile(os.path.join(path, f))])
#function that returns your desired output for a given file
def file_head_ext(file_path, file_num):
with open(file_path + "/" + file_num) as myfile:
head = [next(myfile).split("=") for x in range(3)]
formatted_head = [elm[1].replace("\n",'').replace(" ","") for elm in head]
return(",".join(formatted_head))
for filnum in range(1,num_files):
print(file_head_ext(path, str(filnum)))

Making columns of data lists in python

So I have a program, that reads through a bunch of files and appends the necessary data that I need. I need to now take those particular data and show them as a list. To be more specific, these are the parameters I have:
a = Source, b = luminosity, c = luminosity error, d = HST, e = XRS, f = gmag, g = z, and h = rh
I want to display this in a list, each defining a particular column. I just don't know where exactly I should insert the print statement among the various for loops I've done to do this.
I would appreciate any help! Here's the program (the main focus is in the for loops done and how they iterate through the data, and don't worry about indentations, the program so far works I just need to display the data appended in columns):
import sys
import os
import re
import urllib
import urllib2
from os.path import basename
import urlparse
import shutil
base_dirname = '/projects/XRB_Web/apmanuel/499/'
base_sourcefile = base_dirname + 'Sources.txt'
try:
file = open(base_sourcefile, 'r')
except IOError:
print 'Cannot open: '+base_sourcefile
Source = []
Finallist = []
ACS = []
SRC = []
for line in file:
data_line_check = (line.strip())
if data_line_check:
line = re.sub(r'\s+', ' ', line)
point = line.split('|')
temp_source = (point[0]).strip()
if temp_source and len(point) == 3:
Source = (point[0]).strip()
Source = re.sub(r'\s', '_', Source)
print Source+"\n"
temp_finallist = (point[1]).strip()
if temp_finallist:
Finallistaddress = (point[1]).strip()
Finallistaddress = re.sub(r'\s', '_', Finallistaddress)
Luminositybase_dirname1 = '/projects/XRB_Web/apmanuel/499/Lists/' + Finallistaddress
try:
file2 = open(Luminositybase_dirname1, 'r')
except IOError:
print 'Cannot open: '+Luminositybase_dirname1
source = []
luminosity = []
luminosityerr = []
for line in file2:
pointy = line.split()
a = int(pointy[0])
b = float(pointy[5])
c = float(pointy[6])
source.append(a)
luminosity.append(b)
luminosityerr.append(c)
temp_HST = (point[2]).strip()
if temp_HST:
HSTaddress = (point[2]).strip()
HSTaddress = re.sub(r'\s', '_', HSTaddress)
HSTbase_dirname2 = '/projects/XRB_Web/apmanuel/499/Lists/' + HSTaddress
try:
file3 = open(HSTbase_dirname2, 'r')
except IOError:
print 'Cannot open: '+HSTbase_dirname2
HST = []
for line in file3:
pointy2 = line.split()
d = int(pointy2[0])
HST.append(d)
temp_XRS = (point[3]).strip()
if temp_XRS:
XRSaddress = (point[3]).strip()
XRSaddress =re.sub(r'\s', '_', XRSaddress)
XRSbase_dirname3 = '/projects/XRB_Web/apmanuel/499/Lists/' + XRSaddress
try:
file4 = open(XRSbase_dirname3, 'r')
except IOError:
print 'Cannot open: '+XRSbase_dirname3
XRS = []
for line in file4:
pointy3 = line.split()
e = int(pointy3[0])
XRS.append(e)
temp_others = (point[4]).strip()
if temp_others:
othersaddress = (point[4]).strip()
othersaddress =re.sub(r'\s', '_', othersaddress)
othersbase_dirname4 = '/projects/XRB_Web/apmanuel/499/Lists/' + othersaddress
try:
file5 = open(othersbase_dirname4, 'r')
except IOError:
print 'Cannot open: '+othersbase_dirname4
gmag = []
z = []
rh = []
for line in file5:
pointy4 = line.split()
f = float(pointy4[3])
g = float(pointy4[5])
h = float(pointy4[7])
rh.append(f)
gmag.append(g)
z.append(h)

this function will return columns for a list of rows. note that this requires the lists to all have an element in the column you are trying to access, though it would be relatively simple to change this if you need it.
def getcolumn(matrix,index): #index specifies which column of the matrix you want. note that like all other list indexes, this starts from 0, not one.
column = []
for row in matrix:
column.append(row[index])
return column

Select lines stack python

i writen this code:
import os
import re
import string
##
Path = 'C:/RESULT/BATCH/'
##
Nfile = 'Skin_Refined_v05'
f=open(Path + Nfile + '.inp')
n=open(Path + 'newfile.inp', 'w')
for lines, text in enumerate(f):
found = text.find('*SURFACE')
while found > -1:
print found, lines, text
found = text.find('*SURFACE', found + 1)
n.write(text)
##
f.close()
n.close()
This is what *.inp looks like (usually about 30Mb)
*SURFACE, NAME = BOTTOM, TYPE = ELEMENT
40012646, S2
40012647, S2
40012648, S2
40012649, S2
40012650, S2
40012651, S2
*SURFACE, NAME = ALL_INT_TIE_1, TYPE = ELEMENT
40243687, S3
40243703, S3
40243719, S3
40243735, S3
40243751, S3
40243767, S3
**
*TIE, NAME = INTERNAL_TIE, POSITION TOLERANCE = 1.0 , ADJUST=NO
SLAVE,MASTER
*TIE, NAME = SKN_REF_1
ALL_INT_FRONT, ALL_EXT_FRONT
*TIE, NAME = SKIN_LAT
ALL_INT_LAT, ALL_EXT_LAT
*TIE, NAME = SKIN_TIE_1
ALL_INT_TIE_1, ALL_INT_TIE_2
**
*SURFACE , NAME = TOP, COMBINE = UNION
TOP_1
TOP_2
**HM_UNSUPPORTED_CARDS
*END PART
*****
what he does it is clear. what I would like to achive is to get all the line between the *SURFACE that begin with a number, which then I will have to arrange differently, but I will worry about that later.
I rewrote the code cos i could not get it to work as suggested, now it is creating the blocks as I need them, but how do i work on each block?
I need to separate all the elements (number followed by S1, S2 and so on) and create groups for each block sorted by S1, S2 and so on the final result should look like
*ELSET, ELSET=TOP_S1
40221320, 40221306, 40221305, 40221304, 40221290, 40221289, 40221288, 40221274,
40221273, 40221272, 40221258, 40221257, 40221256, 40221242, 40221241, 40221240,
*SURFACE, NAME = TOP, TYPE = ELEMENT
TOP_S1,S1
import os
import re
import string
##
Path = 'C:/RESULT/BATCH/'
##
Nfile = 'Skin_Refined_v05'
f=open(Path + Nfile + '.inp')
n=open(Path + 'newfile.inp', 'w')
in_surface_block = False;
for line_num, text in enumerate(f):
found = text.find('*SURFACE')
if found > -1:
in_surface_block=True;
print found, line_num, text
surface_lines = []
continue
if in_surface_block:
m = re.match('\s*\d+\,\s*\w\d+',text)
if m:
mtext = m.group(0)
## p=surface_lines.append(text)
print mtext
## ntext = surface_lines.append(m.group(0))
## n.write(ntext)
##
f.close()
n.close()
I hope it is clear

I think this will do what you want:
import os
import re
##
Path = 'C:/RESULT/BATCH/'
##
Nfile = 'Skin_Refined_v05'
f=open(Path + Nfile + '.inp')
n=open(Path + 'newfile.inp', 'w')
in_surface_block = False;
for line_num, text in enumerate(f):
found = text.find('*SURFACE')
if found > -1:
in_surface_block=True;
print found, line_num, text
surface_lines = []
continue
if in_surface_block:
if re.match('\s*\d+', text):
surface_lines.append(text)
else:
in_surface_block = False
// do surface lines work here:
// surface_lines is a list with all the lines in a surface block
// that start with a number
...
##
f.close()
n.close()
Edit: Fixed logic error

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python - Loop inside loop of two files with regex - python

Related

How to do motif search using python?

Reading repeated information from the file in different order in Python

Extracting data from a text file to an output file

Making columns of data lists in python

Select lines stack python

Categories

Resources