I have a program that reads through a bunch of files and appends the necessary data to lists. I now need to take those data and show them as a list. To be more specific, these are the parameters I have:
a = Source, b = luminosity, c = luminosity error, d = HST, e = XRS, f = gmag, g = z, and h = rh
I want to display these in a list, with each parameter defining a particular column. I just don't know where exactly to insert the print statement among the various for loops I've written.
I would appreciate any help! Here's the program (the main focus is on the for loops and how they iterate through the data; don't worry about the indentation, which was lost in pasting. The program works so far, I just need to display the appended data in columns):
import sys
import os
import re
import urllib
import urllib2
from os.path import basename
import urlparse
import shutil
base_dirname = '/projects/XRB_Web/apmanuel/499/'
base_sourcefile = base_dirname + 'Sources.txt'
try:
file = open(base_sourcefile, 'r')
except IOError:
print 'Cannot open: '+base_sourcefile
Source = []
Finallist = []
ACS = []
SRC = []
for line in file:
data_line_check = (line.strip())
if data_line_check:
line = re.sub(r'\s+', ' ', line)
point = line.split('|')
temp_source = (point[0]).strip()
if temp_source and len(point) == 3:
Source = (point[0]).strip()
Source = re.sub(r'\s', '_', Source)
print Source+"\n"
temp_finallist = (point[1]).strip()
if temp_finallist:
Finallistaddress = (point[1]).strip()
Finallistaddress = re.sub(r'\s', '_', Finallistaddress)
Luminositybase_dirname1 = '/projects/XRB_Web/apmanuel/499/Lists/' + Finallistaddress
try:
file2 = open(Luminositybase_dirname1, 'r')
except IOError:
print 'Cannot open: '+Luminositybase_dirname1
source = []
luminosity = []
luminosityerr = []
for line in file2:
pointy = line.split()
a = int(pointy[0])
b = float(pointy[5])
c = float(pointy[6])
source.append(a)
luminosity.append(b)
luminosityerr.append(c)
temp_HST = (point[2]).strip()
if temp_HST:
HSTaddress = (point[2]).strip()
HSTaddress = re.sub(r'\s', '_', HSTaddress)
HSTbase_dirname2 = '/projects/XRB_Web/apmanuel/499/Lists/' + HSTaddress
try:
file3 = open(HSTbase_dirname2, 'r')
except IOError:
print 'Cannot open: '+HSTbase_dirname2
HST = []
for line in file3:
pointy2 = line.split()
d = int(pointy2[0])
HST.append(d)
temp_XRS = (point[3]).strip()
if temp_XRS:
XRSaddress = (point[3]).strip()
XRSaddress =re.sub(r'\s', '_', XRSaddress)
XRSbase_dirname3 = '/projects/XRB_Web/apmanuel/499/Lists/' + XRSaddress
try:
file4 = open(XRSbase_dirname3, 'r')
except IOError:
print 'Cannot open: '+XRSbase_dirname3
XRS = []
for line in file4:
pointy3 = line.split()
e = int(pointy3[0])
XRS.append(e)
temp_others = (point[4]).strip()
if temp_others:
othersaddress = (point[4]).strip()
othersaddress =re.sub(r'\s', '_', othersaddress)
othersbase_dirname4 = '/projects/XRB_Web/apmanuel/499/Lists/' + othersaddress
try:
file5 = open(othersbase_dirname4, 'r')
except IOError:
print 'Cannot open: '+othersbase_dirname4
gmag = []
z = []
rh = []
for line in file5:
pointy4 = line.split()
f = float(pointy4[3])
g = float(pointy4[5])
h = float(pointy4[7])
rh.append(f)
gmag.append(g)
z.append(h)
This function will return a column from a list of rows. Note that it requires every row to have an element at the column index you are trying to access, though it would be relatively simple to change this if you need to.
def getcolumn(matrix, index):  #index specifies which column of the matrix you want. note that like all other list indexes, this starts from 0, not one.
    column = []
    for row in matrix:
        column.append(row[index])
    return column
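For the original goal of printing the appended data in columns, here is a minimal usage sketch (assuming the per-column lists such as source, luminosity and luminosityerr all have the same length; the sample values below are made up):

source = [1, 2, 3]                      # hypothetical stand-ins for the parsed file data
luminosity = [10.5, 11.2, 9.8]
luminosityerr = [0.3, 0.4, 0.2]

# zip the per-column lists into rows; getcolumn() can then pull any column back out
rows = [list(r) for r in zip(source, luminosity, luminosityerr)]
print getcolumn(rows, 1)                # -> [10.5, 11.2, 9.8]

# or print the rows as aligned columns, one line per source
for row in rows:
    print '%-10s %-14s %-14s' % tuple(row)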
I am trying to check for the nrf2 binding motif using a regular expression with Python. I have done this in R using the JASPAR2018 PWM, but due to a few issues with JASPAR I wish to redo it using Python.
Attempt
from Bio import SeqIO
from itertools import islice
import pandas as pd
import re
import gzip
import twobitreader as tbr   # the code below also uses re, gzip and tbr; these imports are missing from the post (tbr is assumed to be twobitreader)
# Creating reverse complements
def reverseComp(Seq):
    seq = Seq.upper()
    d = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
    try:
        seq = seq[::-1]
        rc_seq = "".join([d[nuc] for nuc in seq])
    except KeyError:
        return "Not Viable DNA Seq"
    return rc_seq

def genSeq(genome_path, chrom, chromstart, chromend):
    if bool(re.search('gz', genome_path)) | bool(re.search('fa', genome_path)) | bool(re.search('fasta', genome_path)):
        if bool(re.search('gz', genome_path)) == True:
            genome = SeqIO.parse(gzip.open(genome_path, 'rt'), 'fasta')
            identifiers = [seq_record.id for seq_record in genome]
            seq_gen = next(islice(genome, identifiers.index(chrom), None))
            seq = str(seq_gen.seq[chromstart:chromend])
        else:
            genome = SeqIO.parse(open(genome_path), 'fasta')
            identifiers = [seq_record.id for seq_record in genome]
            seq_gen = next(islice(genome, identifiers.index(chrom) + 1, None))
            seq = str(seq_gen.seq[chromstart:chromend])
    elif bool(re.search('2bit', genome_path)):
        tbGenome = tbr.TwoBitFile(genome_path)
        seq = tbGenome[chrom][chromstart:chromend]
    else:
        raise Exception('File type not recognized')
    return (seq).upper()

pat = "[AGC]TGA[CTG][ATCG][CAT][AGT]GC[ATCG]"
pattern = re.compile(pat)
motifDF = []
motifQuant = []
with open('/Users/kalyanidhusia/Desktop/nrf2_R/ENCFF126HBJ.bed') as f:
    for line in f:
        peak = list(line.split())
        seq = genSeq('hg19.fa', peak[0], int(peak[1]), int(peak[2]))
        rSeq = reverseComp(seq)
        sequences = []
        for result in re.finditer(pattern, seq):
            sequences.append("".join(result.groups()))
        for result in re.finditer(pattern, rSeq):
            sequences.append("".join(result.groups()))
        if len(sequences) > 0:
            seqs = pd.DataFrame({'binding':sequences, 'chrom':peak[0], 'chromstart':peak[1], 'chromend':peak[2]})
            motifDF.append(seqs)
            motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])

search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
Error
This is the error I am getting:
<ipython-input-3-2e7ebdf92205> in genSeq(genome_path, chrom, chromstart, chromend)
     25         identifiers = [seq_record.id for seq_record in genome]
---> 26         seq_gen = next(islice(genome, identifiers.index(chrom)+1 , None))
     27         seq = str(seq_gen.seq[chromstart:chromend])
     28     elif bool(re.search('2bit', genome_path)):

StopIteration:
How do I solve this problem?
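For what it's worth, the usual cause of this StopIteration is that SeqIO.parse() returns a generator: the list comprehension that builds identifiers consumes it completely, so the following next(islice(genome, ...)) has nothing left to yield. A minimal sketch of that idea (independent of the 2bit approach used below; 'chr1' and the slice bounds are placeholders):

from Bio import SeqIO

# materialise the records once so they can be scanned more than once
genome = list(SeqIO.parse(open('hg19.fa'), 'fasta'))
identifiers = [seq_record.id for seq_record in genome]
seq_gen = genome[identifiers.index('chr1')]     # index into the list instead of islice/next
seq = str(seq_gen.seq[100:200])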
I was able to solve the above problem by tweaking my code a little. Here is the solved example for you guys, along with my remaining problem with the code below:
motif = '[REGULAR_EXPRESSION_FOR_YOUR_MOTIF]'
regBS = re.compile(motif)
motifDF = []
motifQuant = []
genome = tbr.TwoBitFile('/Path_to_your_genomefile_in_2bit.2bit/')
with open('/Path_to_your.bedfile/') as f:
    for line in f:
        if line.startswith('track') == False:
            peak = list(line.split())
            seq = (genome[peak[0]][int(peak[1]):int(peak[2])]).upper()
            rSeq = reverseComp(seq)
            sequences = []
            sequences.extend(re.findall(regBS, seq))
            sequences.extend(re.findall(regBS, rSeq))
            if len(sequences) > 0:
                seqs = pd.DataFrame({'binding':sequences, 'chrom':peak[0], 'chromstart':peak[1], 'chromend':peak[2], 'NR':'NRF2'})
                motifDF.append(seqs)
                motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])

search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
dist_reg.head()
n = 5
x = [len(i[6+n:-6-n]) for i in search_reg['binding']]
This code generates the peak sequences that I want and stores them in search_reg['binding'], but it also stores a space-separated number with each of them. I need to store these in two different columns. Any suggestions?
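One possible way to split that column in two (a sketch only, not tested against the real data; the sample values are made up) is pandas' str.split with expand=True:

import pandas as pd

# hypothetical stand-in for search_reg, where 'binding' holds "<motif> <number>"
search_reg = pd.DataFrame({'binding': ['ATGACTCAGCA 12', 'CTGATTCAGCT 7']})
search_reg[['motif', 'extra']] = search_reg['binding'].str.split(n=1, expand=True)
print(search_reg[['motif', 'extra']])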
I'm trying to create a loop in order to check two files and compare, with a regex, whether a specific field matches.
avi file
TVShowName.S01E01.W.DVDRip.XviD.avi
TVShowName.S01E02.W.DVDRip.XviD.avi
TVShowName.S01E03.W.DVDRip.XviD.avi
srt
tvShowName.S01E01.episodename.DVDRip.XviD.srt
tvShowName.S01E02.episodename.DVDRip.XviD.srt
tvShowName.S01E03.episodename.DVDRip.XviD.srt
Without a loop I can match the files and make the magic happen. However, when I use the loop it only gets through the first pair:
TVShowName.S01E01.W.DVDRip.XviD.avi
TVShowName.S01E01.W.DVDRip.XviD.srt
Code:
f1 = open('avi', 'r')
f2 = open('srt', 'r')
f3 = open('merge', 'a')

for avi in f1:
    m = re.search(".*([Ss][0-20].[eE][0-24].)+.*", avi)
    for sub in f2:
        n = re.search(".*([Ss][0-20].[eE][0-24].)+.*", sub)
        if m.group(1) == n.group(1):
            str_avi = str(m.group(0))
            #print str_avi
            ext_srt = str_srt.split('.')
            ext_avi = str_avi.split('.')
            #print ext_avi
            #conv_str = str(m.group(0))
            merge = str_avi.replace(ext_avi[-1], ext_srt[-1])
            print merge
            f3.write(merge)
f3.close()
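For context, the likely reason the loop only reaches the first pair is that the inner for sub in f2: loop exhausts the srt file object on the first pass, so every later avi line sees an empty iterator. A minimal sketch of one way around that, reading both files into lists first (similar to what the answer below does with glob):

import re

with open('avi') as f1, open('srt') as f2:
    avi_names = f1.readlines()       # read both files once ...
    srt_names = f2.readlines()       # ... so they can be scanned repeatedly

for avi in avi_names:
    for sub in srt_names:
        pass                         # compare the two names here, as in the code above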
I'm not entirely sure if this is the output you wanted. I can't add comments because I don't have enough reputation points.
import glob
import re

avifiles = []
srtfiles = []
for afile in glob.glob('*.avi'):
    avifiles.append(afile)
for sfile in glob.glob('*.srt'):
    srtfiles.append(sfile)

#f1 = open('avi', 'r')
#f2 = open('srt', 'r')
f3 = open('merge', 'a')

for avi in avifiles:
    m = re.search(".*([Ss][0-20].[eE][0-24].)+.*", avi)
    for sub in srtfiles:
        n = re.search(".*([Ss][0-20].[eE][0-24].)+.*", sub)
        if m.group(1) == n.group(1):
            str_avi = str(m.group(0))
            str_srt = str(n.group(0))
            ext_srt = str_srt.split('.')
            ext_avi = str_avi.split('.')
            #print ext_avi
            #conv_str = str(m.group(0))
            merge = str_avi.replace(ext_avi[-1], ext_srt[-1])
            print merge
            f3.write(merge + "\n")
f3.close()
I made the following code and it seems to be working. My next step is to add more video extensions, but that should be easy.
Thank you guys for the help!
import re, os, sys, itertools

str_avi = ''
split_avi = ''
global zzz
lista_avi = []
lista_srt = []
lista_final = []

os.chdir('.')
f1 = os.listdir(".")
for full in f1:
    avi = re.search(".*([Ss][0-9].[eE][0-9].)+.*(...$)", full)
    if avi:
        if avi.group(2) == 'avi':
            lista_avi.append(avi.group(0))
        elif avi.group(2) == 'srt':
            lista_srt.append(avi.group(0))
        else:
            pass
    else:
        print "No file found!"

for f, b in itertools.izip(lista_avi, lista_srt):
    data_avi = f.split('.')
    data_srt = b.split('.')
    data_regx_avi = re.search(".*([Ss][0-9].[eE][0-9].)+.*(...$)", f)
    data_regx_srt = re.search(".*([Ss][0-9].[eE][0-9].)+.*(...$)", b)
    for x in lista_srt:
        data_regx_srt = re.search(".*([Ss][0-9].[eE][0-9].)+.*(...$)", x)
        if data_regx_avi.group(1) == data_regx_srt.group(1):
            print 'Video file:', data_regx_avi.group(0)
            print 'Sub file:  ', f.replace(data_avi[-1], data_srt[-1])
            #lista_final.append(f.replace(data_avi[-1],data_srt[-1]))
            xx = f.replace(data_avi[-1], data_srt[-1])
            os.rename(x, xx)
I tried to search for similar questions, but I couldn't find any. Please mark this as a duplicate if a similar question is available.
I'm trying to figure out a way to read and gather multiple pieces of information from a single file. In the file, Block-A, B & C are repeated in random order, and Block-C has more than one piece of information to capture. Every block ends with an 'END' line. Here is the input file:
Block-A:
(info1)
END
Block-B:
(info2)
END
Block-C:
(info3)
(info4)
END
Block-C:
(info7)
(info8)
END
Block-A:
(info5)
END
Block-B:
(info6)
END
Here is my code:
import re

out1 = out2 = out3 = ""
a = b = c = False
array = []
with open('test.txt', 'r') as f:
    for line in f:
        if line.startswith('Block-A'):
            line = next(f)
            out1 = line
            a = True
        if line.startswith('Block-B'):
            line = next(f)
            out2 = line
            b = True
        if line.startswith('Block-C'):
            c = True
        if c:
            line = next(f)
            if not line.startswith('END\n'):
                out3 = line
                array.append(out3.strip())
        if a == b == c == True:
            print(out1.rstrip() + ', ' + out2.rstrip() + ', ' + str(array))
            a = b = c = False
            array = []
Thank you in advance for your valuable inputs.
Use a dictionary for the data from each block. When you read the line that starts a block, set a variable to that name and use it as the key into the dictionary.
out = {}
blockname = None
with open('test.txt', 'r') as f:
    for line in f:
        line = line.rstrip('\n')
        if line.endswith(':'):
            blockname = line[:-1]
            if blockname not in out:
                out[blockname] = ''
        elif line == 'END':
            blockname = None
        elif blockname:
            out[blockname] += line + '\n'
print(out)
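With the sample file above, out would end up looking roughly like this, with repeated blocks of the same name merged together:

{'Block-A': '(info1)\n(info5)\n',
 'Block-B': '(info2)\n(info6)\n',
 'Block-C': '(info3)\n(info4)\n(info7)\n(info8)\n'}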
If you don't want the Block-X headers to print, uncomment the elif statement:
import os

data = r'/home/x/Desktop/test'
txt = open(data, 'r')
for line in txt.readlines():
    line = line[:-1]
    if line == 'END':
        pass
    #elif line.startswith('Block'):
    #    pass
    else:
        print line
>>>>
Block-A:
(info1)
Block-B:
(info2)
Block-C:
(info3)
(info4)
Block-C:
(info7)
(info8)
Block-A:
(info5)
Block-B:
(info6)
There is a CSV file, say A.csv, with the following content:
Place,Hotel,Food,Fare
Norway,Regal,NonVeg,5000
Poland,Jenny,Italiano,6000
Norway,Suzane,Vegeterian,4000
Norway,Regal,NonVeg,5000
I have to parse this CSV and produce output according to the arguments passed at the command prompt.
Example 1:
mycode.py Place
Desired output is:
Place,Fare
Norway,14000
Poland,6000
Example 2:
mycode.py Place Hotel
Desired output is:
Place,Hotel,Fare
Norway,Regal,10000
Poland,Jenny,6000
Norway,Suzane,4000
So, as the examples show, whatever columns you pass as arguments, the rows sharing the same values in those columns are grouped together and their Fare values are summed (e.g. for Place alone, Norway appears three times: 5000 + 4000 + 5000 = 14000).
Below is my code. I am able to pass arguments and get an output, but I am stuck on the sum of Fare. Can anyone help me with this?
import sys
import csv
import collections
d = collections.defaultdict(list)
Data = []
Result = []
Final = []
Argvs = []
argv_len = len(sys.argv)
index = 0
input = ''
file = open('A.csv', 'rb')
try:
reader = csv.reader(file)
for row in reader:
Data.append(row)
for x in range(1, argv_len):
Argvs.append(sys.argv[x])
Argvs.append('Fare')
for input in Argvs:
for y in range(0, len(Data[0])):
if(input == Data[0][y]):
for z in range(1, len(Data)):
Result.append(Data[z][y])
break
Final.append(Result)
Result = []
New = []
NewFinal = []
for x in range(0, len(Final[0])):
for y in range(0, len(Final)):
New.append(Final[y][x])
NewFinal.append(New)
New = []
out = {}
for a in NewFinal:
out.setdefault(a[0],[]).append(int(a[-1]))
with open("output.csv", "wb") as csv_file:
writer = csv.writer(csv_file, dialect='excel', delimiter=',')
writer.writerow(Argvs)
for k,v in out.iteritems():
writer.writerow((k,sum(v)))
except Exception,e:
print str(e)
finally:
file.close()
I edited the code and tried to group it. Now I am able to get the aggregate of the Fare, but not the desired output. So when I pass:
mycode.py Place Hotel
Instead of:
Place,Hotel,Fare
Norway,Regal,10000
Poland,Jenny,6000
Norway,Suzane,4000
I am getting:
Place,Hotel,Fare
Norway,14000
Poland,6000
Finally I managed to get my desired output. Below I am sharing the final code.
import sys
import csv
Data = []
Result = []
Final = []
Argvs = []
argv_len = len(sys.argv)
index = 0
input = ''
file = open('A.csv', 'rb')
try:
reader = csv.reader(file)
for row in reader:
Data.append(row)
for x in range(1, argv_len):
Argvs.append(sys.argv[x])
Argvs.append('Fare')
for input in Argvs:
for y in range(0, len(Data[0])):
if(input == Data[0][y]):
for z in range(1, len(Data)):
Result.append(Data[z][y])
break
Final.append(Result)
Result = []
New = []
NewFinal = []
for x in range(0, len(Final[0])):
for y in range(0, len(Final)):
New.append(Final[y][x])
NewFinal.append(New)
New = []
out = {}
for a in NewFinal:
count_val = a[-1]
del a[-1]
key_val = ','.join(a)
out.setdefault(key_val.strip('"'),[]).append(int(count_val))
with open("output.csv", "wb") as csv_file:
writer = csv.writer(csv_file, delimiter=',',quotechar=' ')
writer.writerow(Argvs)
for k,v in out.iteritems():
writer.writerow((k,sum(v)))
except Exception,e:
print str(e)
finally:
file.close()
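For reference, the same group-and-sum can be sketched much more compactly with csv.DictReader (a sketch only, assuming the same A.csv layout and the same command-line arguments as above):

import sys
import csv
from collections import OrderedDict

cols = sys.argv[1:]                         # e.g. ['Place', 'Hotel']
totals = OrderedDict()
with open('A.csv') as f:
    for row in csv.DictReader(f):
        key = tuple(row[c] for c in cols)   # group by the selected columns
        totals[key] = totals.get(key, 0) + int(row['Fare'])

print(','.join(cols + ['Fare']))
for key, fare in totals.items():
    print(','.join(list(key) + [str(fare)]))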
I have written this code:
import os
import re
import string
##
Path = 'C:/RESULT/BATCH/'
##
Nfile = 'Skin_Refined_v05'
f=open(Path + Nfile + '.inp')
n=open(Path + 'newfile.inp', 'w')
for lines, text in enumerate(f):
found = text.find('*SURFACE')
while found > -1:
print found, lines, text
found = text.find('*SURFACE', found + 1)
n.write(text)
##
f.close()
n.close()
This is what the *.inp file looks like (usually about 30 MB):
*SURFACE, NAME = BOTTOM, TYPE = ELEMENT
40012646, S2
40012647, S2
40012648, S2
40012649, S2
40012650, S2
40012651, S2
*SURFACE, NAME = ALL_INT_TIE_1, TYPE = ELEMENT
40243687, S3
40243703, S3
40243719, S3
40243735, S3
40243751, S3
40243767, S3
**
*TIE, NAME = INTERNAL_TIE, POSITION TOLERANCE = 1.0 , ADJUST=NO
SLAVE,MASTER
*TIE, NAME = SKN_REF_1
ALL_INT_FRONT, ALL_EXT_FRONT
*TIE, NAME = SKIN_LAT
ALL_INT_LAT, ALL_EXT_LAT
*TIE, NAME = SKIN_TIE_1
ALL_INT_TIE_1, ALL_INT_TIE_2
**
*SURFACE , NAME = TOP, COMBINE = UNION
TOP_1
TOP_2
**HM_UNSUPPORTED_CARDS
*END PART
*****
What it does is clear. What I would like to achieve is to get all the lines between the *SURFACE cards that begin with a number, which I will then have to arrange differently, but I will worry about that later.
I rewrote the code because I could not get it to work as suggested. Now it is creating the blocks as I need them, but how do I work on each block?
I need to separate all the elements (a number followed by S1, S2 and so on) and create groups for each block sorted by S1, S2 and so on. The final result should look like:
*ELSET, ELSET=TOP_S1
40221320, 40221306, 40221305, 40221304, 40221290, 40221289, 40221288, 40221274,
40221273, 40221272, 40221258, 40221257, 40221256, 40221242, 40221241, 40221240,
*SURFACE, NAME = TOP, TYPE = ELEMENT
TOP_S1,S1
import os
import re
import string
##
Path = 'C:/RESULT/BATCH/'
##
Nfile = 'Skin_Refined_v05'
f = open(Path + Nfile + '.inp')
n = open(Path + 'newfile.inp', 'w')

in_surface_block = False
for line_num, text in enumerate(f):
    found = text.find('*SURFACE')
    if found > -1:
        in_surface_block = True
        print found, line_num, text
        surface_lines = []
        continue
    if in_surface_block:
        m = re.match('\s*\d+\,\s*\w\d+', text)
        if m:
            mtext = m.group(0)
            ## p = surface_lines.append(text)
            print mtext
            ## ntext = surface_lines.append(m.group(0))
            ## n.write(ntext)
##
f.close()
n.close()
I hope that is clear.
I think this will do what you want:
import os
import re

##
Path = 'C:/RESULT/BATCH/'
##
Nfile = 'Skin_Refined_v05'
f = open(Path + Nfile + '.inp')
n = open(Path + 'newfile.inp', 'w')

in_surface_block = False
for line_num, text in enumerate(f):
    found = text.find('*SURFACE')
    if found > -1:
        in_surface_block = True
        print found, line_num, text
        surface_lines = []
        continue
    if in_surface_block:
        if re.match('\s*\d+', text):
            surface_lines.append(text)
        else:
            in_surface_block = False
            # do surface lines work here:
            # surface_lines is a list with all the lines in a surface block
            # that start with a number
            ...
##
f.close()
n.close()
Edit: Fixed logic error
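As one possible way to fill in the "do surface lines work here" part (a sketch only; write_surface_block is a hypothetical helper, and it assumes the block's NAME has been captured from the *SURFACE card, that each data line is "element_id, face", and that n is the output file opened above):

def write_surface_block(n, name, surface_lines):
    # group element ids by their face label (S1, S2, ...)
    faces = {}
    for text in surface_lines:
        elem, face = [p.strip() for p in text.split(',')[:2]]
        faces.setdefault(face, []).append(elem)
    # emit one *ELSET per face, eight ids per line, then the *SURFACE card
    for face in sorted(faces):
        n.write('*ELSET, ELSET=%s_%s\n' % (name, face))
        ids = faces[face]
        for i in range(0, len(ids), 8):
            n.write(', '.join(ids[i:i+8]) + ',\n')
    n.write('*SURFACE, NAME = %s, TYPE = ELEMENT\n' % name)
    for face in sorted(faces):
        n.write('%s_%s,%s\n' % (name, face, face))

Called with name='TOP', this would produce output in the same shape as the desired result shown in the question (an *ELSET block per face followed by the *SURFACE card referencing it).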