I am unable to get the last 3 digits of the id number.
from datetime import datetime

def days_to_birthday(date):
    datetime_object = datetime.strptime(date, "%Y-%m-%d")
    date = datetime_object.date()
    num_days = date.timetuple().tm_yday
return num_days

fo = open("Data.txt", 'r')  # File containing the data
content = [i.rsplit() for i in fo.readlines()]
names = [content[i][0] for i in range(len(content))]
dates = [content[i][1] for i in range(len(content))]
gender = [content[i][2] for i in range(len(content))]
id_numbers = []
mydict = dict(zip(dates, gender))

for i in mydict:
    x = days_to_birthday(i)
    if mydict.get(i) == "F": x += 500
    x = str(x)
    if len(x) < 3: x = x.zfill(3)
    i = i.split('-')
    out = i[0] + x
    id_numbers.append(out)

for i in range(len(names)):
    print(f"{names[i]} {id_numbers[i]}")
Running your code would raise SyntaxError: 'return' outside function.
Because Python is a whitespace-sensitive language, return num_days must be indented one more level so that it sits inside the days_to_birthday function.
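For reference, here is the function with only that line re-indented:

from datetime import datetime

def days_to_birthday(date):
    datetime_object = datetime.strptime(date, "%Y-%m-%d")
    date = datetime_object.date()
    num_days = date.timetuple().tm_yday
    return num_days  # now inside the function body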
I am a beginner. I have a log file like the one below, for ~1000 cycle loops:
"CycleSTART#1
Temp=26C
Fan=3000
CycleSTART#2
Temp=27C
Fan=3200
.
.
.
."
My objective is to read the Temp and Fan values corresponding to each cycle count; basically, I want to put everything in a table. I tried this simple program:
string1 = 'CycleSTART#'
string2 = 'Temp'
string3 = 'Fan'

import pandas as pd

filepath = "XXX<location of txt file>"
with open(filepath) as fp:
    line = fp.readline()
    cnt = 1
    while line:
        cnt += 1
        flag = 0
        index = 0
        for line in fp:
            if string1 in line:
                flag = 1
                break
        if flag == 1:
            lword = len(line)
            extracted_string1 = line[11:11 + lword]
        for line in fp:
            if string2 in line:
                flag = 2
                break
        if flag == 2:
            lword = len(line)
            extracted_string2 = line[6:6 + lword]
        for line in fp:
            if string3 in line:
                flag = 3
                break
        if flag == 3:
            lword = len(line)
            extracted_string3 = line[5:5 + lword]

        data = {'cycle': [extracted_string1],
                'temp': [extracted_string2],
                'Fan': [extracted_string3]}
        df = pd.DataFrame(data, columns=['cycle', 'temp', 'Fan'])
        print(df)
f.close()
I tried this, but every time I only get the first cycle value; it is not looping through to the next cycles.
I would rewrite a little bit and use splits to stay away from regex.
import pandas as pd

def add_to_dict(value, key, dic):
    # Append value to the list stored under key, creating the list on first use
    if key in dic:
        dic[key].append(value)
    else:
        dic[key] = [value]

filepath = "XXX<location of txt file>"
data_container = {}

with open(filepath, "r") as f:
    for indx, line in enumerate(f):
        if indx % 3 == 0:
            value = int(line.split("#")[-1])                 # Split on # and convert the cycle count to a number
            key = "cycle"
        elif indx % 3 == 1:
            value = float(line.split("=")[-1].strip()[:-1])  # Split on =, then drop the trailing C to get the temperature
            key = "temp"
        elif indx % 3 == 2:
            value = int(line.split("=")[-1])                 # Split on = to get the numerical fan value and convert it to int
            key = "Fan"
        add_to_dict(value, key, data_container)

dataframe = pd.DataFrame.from_dict(data_container)
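With the two sample cycles shown in the question, the resulting frame should come out roughly like this (a sketch of the expected output, not run against a real log file):

print(dataframe)
#    cycle  temp   Fan
# 0      1  26.0  3000
# 1      2  27.0  3200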
I am trying to check for the nrf2 binding motif using a regular expression with Python. I have already done this in R using the JASPAR2018 PWM, but due to a few issues with JASPAR I wish to redo it in Python.
Attempt
from Bio import SeqIO
from itertools import islice
import pandas as pd
import re    # used by re.search / re.compile below
import gzip  # used to open gzipped FASTA files

# Creating reverse complements
def reverseComp(Seq):
    seq = Seq.upper()
    d = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    try:
        seq = seq[::-1]
        rc_seq = "".join([d[nuc] for nuc in seq])
    except KeyError:
        return "Not Viable DNA Seq"
    return rc_seq

def genSeq(genome_path, chrom, chromstart, chromend):
    if bool(re.search('gz', genome_path)) | bool(re.search('fa', genome_path)) | bool(re.search('fasta', genome_path)):
        if bool(re.search('gz', genome_path)) == True:
            genome = SeqIO.parse(gzip.open(genome_path, 'rt'), 'fasta')
            identifiers = [seq_record.id for seq_record in genome]
            seq_gen = next(islice(genome, identifiers.index(chrom), None))
            seq = str(seq_gen.seq[chromstart:chromend])
        else:
            genome = SeqIO.parse(open(genome_path), 'fasta')
            identifiers = [seq_record.id for seq_record in genome]
            seq_gen = next(islice(genome, identifiers.index(chrom) + 1, None))
            seq = str(seq_gen.seq[chromstart:chromend])
    elif bool(re.search('2bit', genome_path)):
        tbGenome = tbr.TwoBitFile(genome_path)  # tbr (a 2bit reader) is imported elsewhere
        seq = tbGenome[chrom][chromstart:chromend]
    else:
        raise Exception('File type not recognized')
    return (seq).upper()

pat = "[AGC]TGA[CTG][ATCG][CAT][AGT]GC[ATCG]"
pattern = re.compile(pat)
motifDF = []
motifQuant = []

with open('/Users/kalyanidhusia/Desktop/nrf2_R/ENCFF126HBJ.bed') as f:
    for line in f:
        peak = list(line.split())
        seq = genSeq('hg19.fa', peak[0], int(peak[1]), int(peak[2]))
        rSeq = reverseComp(seq)
        sequences = []
        for result in re.finditer(pattern, seq):
            sequences.append("".join(result.groups()))
        for result in re.finditer(pattern, rSeq):
            sequences.append("".join(result.groups()))
        if len(sequences) > 0:
            seqs = pd.DataFrame({'binding': sequences, 'chrom': peak[0], 'chromstart': peak[1], 'chromend': peak[2]})
            motifDF.append(seqs)
            motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])

search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
Error
This is the error I am getting:
<ipython-input-3-2e7ebdf92205> in genSeq(genome_path, chrom, chromstart, chromend)
     25         identifiers = [seq_record.id for seq_record in genome]
---> 26         seq_gen = next(islice(genome, identifiers.index(chrom)+1 , None))
     27         seq = str(seq_gen.seq[chromstart:chromend])
     28     elif bool(re.search('2bit', genome_path)):

StopIteration:
How do I solve this problem?
I was able to solve the problem above by tweaking my code a little. Here is the working example, followed by the remaining problem I have with it:
motif = '[REGULAR_EXPRESSION_FOR_YOUR_MOTIF]'
regBS = re.compile(motif)
motifDF = []
motifQuant = []
genome = tbr.TwoBitFile('/Path_to_your_genomefile_in_2bit.2bit/')

with open('/Path_to_your.bedfile/') as f:
    for line in f:
        if line.startswith('track') == False:
            peak = list(line.split())
            seq = (genome[peak[0]][int(peak[1]):int(peak[2])]).upper()
            rSeq = reverseComp(seq)
            sequences = []
            sequences.extend(re.findall(regBS, seq))
            sequences.extend(re.findall(regBS, rSeq))
            if len(sequences) > 0:
                seqs = pd.DataFrame({'binding': sequences, 'chrom': peak[0], 'chromstart': peak[1], 'chromend': peak[2], 'NR': 'NRF2'})
                motifDF.append(seqs)
                motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])

search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
dist_reg.head()

n = 5
x = [len(i[6+n:-6-n]) for i in search_reg['binding']]
This code generates the peak sequences that I want and stores them in search_reg['binding'], but it also stores a space-separated number with each one. I need to store them in two different columns. Any suggestions?
I have written a function to read files with a specific wildcard and path (like the one below, for example) from the previous seven days.
def i_get_last_week_file(self, par_path, par_file_wildcard, par_datetime):
    proc_datetime = par_datetime - timedelta(weeks=1)
    logger.info('Processing time: %s', str(proc_datetime))
    # file_list = []
    while proc_datetime <= par_datetime:
        sdate = proc_datetime.strftime('%Y_%m_%d-%H')
        for p_file in os.listdir(par_path):
            if fnmatch.fnmatch(p_file, par_file_wildcard + sdate + '*.csv'):
                self.files_list.append(p_file)
        break
    return self.files_list

def get_csv_from_local_weekly(self, par_path, par_wildcard_name, par_date=None, par_time=None):
    if par_date is None:
        par_date = self.file_date
    if par_time is None:
        par_time = self.file_time
    end_datetime = datetime.combine(par_date, par_time)
    a = self.i_get_last_day_file(par_path, par_wildcard_name, end_datetime)
    for i in a:
        try:
            df = pd.read_csv(par_path + '' + i, index_col=None, header=0, delimiter=';')
            self.pandas_list.append(df)
        except Exception:
            pass  # skip files that cannot be read
    frame = pd.concat(self.pandas_list)
    self.files_list = []
    self.pandas_list = []
    return frame
The problem is that this reads only the files that are exactly seven days old, but I need to collect all files from the previous 7 days up to today.
Well, you need to create a range of 7 days, and this is one way of doing it:
import datetime
import os
import re

td = datetime.datetime.today()
lastWeek = [(td - datetime.timedelta(i)).strftime('%Y_%m_%d-%H') for i in range(7)]

for p_file in os.listdir(par_path):
    dateSearch = re.search(r'\d{4}_\d{2}_\d{2}-\d{2}', p_file)
    if dateSearch:
        dateFound = dateSearch.group(0)
        if dateFound in lastWeek:
            ...
Of course, you can change td to your own date variable; I just used it for testing.
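Putting that together, here is a minimal standalone sketch that matches on the date part only, so files from any hour of the last 7 days are picked up (the function name files_from_last_week is just illustrative, and it assumes the filenames embed the date as in your wildcard):

import datetime
import os
import re

def files_from_last_week(par_path):
    today = datetime.datetime.today()
    # Acceptable date strings for today and the previous 6 days
    last_week = {(today - datetime.timedelta(days=i)).strftime('%Y_%m_%d') for i in range(7)}

    matched = []
    for p_file in os.listdir(par_path):
        date_search = re.search(r'\d{4}_\d{2}_\d{2}', p_file)
        if date_search and date_search.group(0) in last_week:
            matched.append(p_file)
    return matched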
Also, your proc_datetime doesn't seem to increment inside the while loop, and if you introduce the increment, you won't need the break:
while proc_datetime <= par_datetime:
    sdate = proc_datetime.strftime('%Y_%m_%d-%H')
    for p_file in os.listdir(par_path):
        if fnmatch.fnmatch(p_file, par_file_wildcard + sdate + '*.csv'):
            self.files_list.append(p_file)
    proc_datetime += datetime.timedelta(days=1)
I am working with some data where a specific column can only be formatted in one of three ways:
3884932039484 (this is randomly generated from my program)
0 (this is static and will never change)
-1 (this is static and will never change)
I want the program to randomly pick between option 1, 2, or 3 and insert the chosen one. This is what I currently have:
import datetime
import random

file = open(r'I:\PythonDataFiles\StandardFeedInput\standardfeed_test.tsv', 'r')
all_lines = file.readlines()

#date_time_answer = input('Please input a date and time(2015-09-15 00:00:00): ')
#if date_time_answer == '':
date_time_answer = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now() - datetime.timedelta(days=1))

# Column indexes in the tab-separated feed
date_time = 1
is_imp = 16
person_id = 19
aid = 44
line_id = 49
cid = 50
is_click = 60
app_id = 0
prev_app_id = ''

new_file = open(r'I:\PythonDataFiles\Standard Feed Output\test2.txt', 'w')

for line in all_lines:
    row = line.split('\t')
    row[date_time] = date_time_answer
    row[person_id] = str((random.randint(1000000000, 9999999999)), 0, -1)
    if row[app_id] == str(prev_app_id):
        row[is_imp] = str(0)
        row[is_click] = str(1)
    else:
        row[is_imp] = str(1)
        prev_app_id = app_id
    print(row)
    new_file.write('\t'.join(row))
Use random.choice() to pick one of the three options:
random.choice([random.randint(1000000000, 9999999999), 0, -1])
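For example, wrapped in a small helper (pick_id is just an illustrative name; the result is converted to a string because the rows hold strings):

import random

def pick_id():
    # Randomly choose between a freshly generated id, 0, and -1
    return random.choice([random.randint(1000000000, 9999999999), 0, -1])

# In the loop from the question this would replace the broken str(...) call:
# row[person_id] = str(pick_id())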
I have a problem. I'm trying to print a series of lists in Python with vertical alignment. My code is:
def show():
    book = "data.txt"
    f = open(book, 'r')
    line = f.readlines()
    f.close()
    x = 0
    z = ''
    l = []
    x = []
    i = 0
    starting = '{:>4} {:>15} {:>15}'.format('Name', "Gender", "Year")
    print(starting)
    for p in line:
        p = p.replace(',', ' ')
        x = p.index(' ')
        name = p[0:x]
        a = p.index('e 1')
        gender = p[x:a+1]
        year = p[(a+2):]
        if len(name) == 3:
            line_new = '{:>2} {:>15} {:>15}'.format(name, gender, year)
        else:
            line_new = '{:>5} {:>15} {:>15}'.format(name, gender, year)
        print(line_new)
The problem is getting the layout I want: the names on the left (that part works), then, under Gender, all the genders aligned in one vertical column, and the same thing for Year.
Untested, but try this:
import itertools

with open("data.txt") as data:
    pep = [line.strip().split(',') for line in data]

# One width per column: the length of the longest entry in that column
widths = [len(max(col, key=len)) for col in itertools.zip_longest(*pep, fillvalue="")]

row_format = "%-{0}s %-{1}s %-{2}s".format(widths[0], widths[1], widths[2])
print(row_format % ("Name", "Gender", "Year"))
print("\n".join(row_format % (attr[0], attr[1], attr[2]) for attr in pep))
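As a quick sanity check, the same formatting applied to a couple of hypothetical rows (standing in for the parsed data.txt) produces:

import itertools

pep = [["Anna", "Female", "1990"], ["Bob", "Male", "1985"]]  # stand-in for the parsed file

widths = [len(max(col, key=len)) for col in itertools.zip_longest(*pep, fillvalue="")]
row_format = "%-{0}s %-{1}s %-{2}s".format(*widths)

print(row_format % ("Name", "Gender", "Year"))
for name, gender, year in pep:
    print(row_format % (name, gender, year))

# Name Gender Year
# Anna Female 1990
# Bob  Male   1985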