A loop to extract URLS from several text files - python

I am attempting to extract a list of URLS from several files using a for loop, however this is resulting in a list of URLS from only the first file, repeated 10 times. I'm not sure what I am doing wrong. Also, I am an absolute beginner at this, so I will presume that there are much better ways of trying to achieve what I want, however this is what I have so far.
type_urls = []
y = 0
for files in cwk_dir:
while y < 10:
open('./cwkfiles/cwkfile{}.crawler.idx'.format(y))
lines = r.text.splitlines()
header_loc = 7
name_loc = lines[header_loc].find('Company Name')
type_loc = lines[header_loc].find('Form Type')
cik_loc = lines[header_loc].find('CIK')
filedate_loc = lines[header_loc].find('Date Filed')
url_loc = lines[header_loc].find('URL')
firstdata_loc = 9
for line in lines[firstdata_loc:]:
company_name = line[:type_loc].strip()
form_type = line[type_loc:cik_loc].strip()
cik = line[cik_loc:filedate_loc].strip()
file_date = line[filedate_loc:url_loc].strip()
page_url = line[url_loc:].strip()
typeandurl = (form_type, page_url)
type_urls.append(typeandurl)
y = y + 1

Here is a more Pythonic way using pathlib and Python 3:
from pathlib import Path
cwk_dir = Path('./cwkfiles')
type_urls = []
header_loc = 7
firstdata_loc = 9
for cwkfile in cwk_dir.glob('cwkfile*.crawler.idx'):
with cwkfile.open() as f:
lines = f.readlines()
name_loc = lines[header_loc].find('Company Name')
type_loc = lines[header_loc].find('Form Type')
cik_loc = lines[header_loc].find('CIK')
filedate_loc = lines[header_loc].find('Date Filed')
url_loc = lines[header_loc].find('URL')
for line in lines[firstdata_loc:]:
company_name = line[:type_loc].strip()
form_type = line[type_loc:cik_loc].strip()
cik = line[cik_loc:filedate_loc].strip()
file_date = line[filedate_loc:url_loc].strip()
page_url = line[url_loc:].strip()
type_urls.append((form_type, page_url))
If you want to test on a small batch of files, replace cwk_dir.glob('cwkfile*.crawler.idx') with cwk_dir.glob('cwkfile[0-9].crawler.idx'). That will give you the first then files if they are sequentially numbered, starting from 0.
And here is better way to put it all together and in a more readable way:
from pathlib import Path
def get_offsets(header):
return dict(
company_name = header.find('Company Name'),
form_type = header.find('Form Type'),
cik = header.find('CIK'),
file_date = header.find('Date Filed'),
page_url = header.find('URL')
)
def get_data(line, offsets):
return dict(
company_name = line[:offsets['form_type']].strip(),
form_type = line[offsets['form_type']:offsets['cik']].strip(),
cik = line[offsets['cik']:offsets['file_date']].strip(),
file_date = line[offsets['file_date']:offsets['page_url']].strip(),
page_url = line[offsets['page_url']:].strip()
)
cwk_dir = Path('./cwkfiles')
types_and_urls = []
header_line = 7
first_data_line = 9
for cwkfile in cwk_dir.glob('cwkfile*.crawler.idx'):
with cwkfile.open() as f:
lines = f.readlines()
offsets = get_offsets(lines[header_line])
for line in lines[first_data_line:]:
data = get_data(line, offsets)
types_and_urls.append((data['form_type'], data['page_url']))

When you get to the second file, the while condition fails as y is already 10.
Try setting y back to 0 just before the while loop:
for files in cwk_dir:
y = 0
while y < 10:
...
And as you're opening the file in the first line inside the while loop, you probably need to close it when exiting the loop.

Related

List inside dictionary

I am unable to get the last 3 digits of the id number.
from datetime import datetime
def days_to_birthday(date):
datetime_object = datetime.strptime(date, "%Y-%m-%d")
date = datetime_object.date()
num_days = date.timetuple().tm_yday
return num_days
fo = open("Data.txt", 'r') # File containg data
content = [i.rsplit() for i in fo.readlines()]
names = [content[i][0] for i in range(len(content))]
dates = [content[i][1] for i in range(len(content))]
gender = [content[i][2] for i in range(len(content))]
id_numbers = []
mydict = dict(zip(dates, gender))
for i in mydict:
x = days_to_birthday(i)
if mydict.get(i) == "F":x += 500
x = str(x)
if len(x) < 3:x = x.zfill(3)
i = i.split('-')
out = i[0] + x
id_numbers.append(out)
for i in range(len(names)):
print(f"{names[i]} {id_numbers[i]}" )
Running your code would raise SyntaxError: 'return' outside function.
Because Python is a whitespace sensitive language, return num_days must be further indented so it applies within the days_to_birthday function.

How to do motif search using python?

I am trying to check for the nrf2 binding motif using regular expression with python. I have done that with R using JASPAR2018 PWM, but due to few issues with JASPAR.
I wish to redo it using python.
Attempt
from Bio import SeqIO
from itertools import islice
import pandas as pd
#Creating Reverese Complements
def reverseComp(Seq):
seq = Seq.upper()
d = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
try:
seq = seq[::-1]
rc_seq = "".join([d[nuc] for nuc in seq])
except KeyError:
return "Not Viable DNA Seq"
return rc_seq
def genSeq(genome_path, chrom, chromstart, chromend):
if bool(re.search('gz', genome_path)) | bool(re.search('fa', genome_path)) | bool(re.search('fasta', genome_path)):
if bool(re.search('gz', genome_path)) == True:
genome = SeqIO.parse(gzip.open(genome_path, 'rt'),'fasta')
identifiers = [seq_record.id for seq_record in genome]
seq_gen = next(islice(genome, identifiers.index(chrom) , None))
seq = str(seq_gen.seq[chromstart:chromend])
else:
genome = SeqIO.parse(open(genome_path),'fasta')
identifiers = [seq_record.id for seq_record in genome]
seq_gen = next(islice(genome, identifiers.index(chrom)+1 , None))
seq = str(seq_gen.seq[chromstart:chromend])
elif bool(re.search('2bit', genome_path)):
tbGenome = tbr.TwoBitFile(genome_path)
seq = tbGenome[chrom][chromstart:chromend]
else:
raise Exception('File type not recognized')
return (seq).upper()
pat = "[AGC]TGA[CTG][ATCG][CAT][AGT]GC[ATCG]"
pattern = re.compile(pat)
motifDF = []
motifQuant = []
with open('/Users/kalyanidhusia/Desktop/nrf2_R/ENCFF126HBJ.bed') as f:
for line in f:
peak = list(line.split())
seq = genSeq('hg19.fa', peak[0], int(peak[1]), int(peak[2]))
rSeq = reverseComp(seq)
sequences = []
for result in re.finditer(pattern, seq):
sequences.append("".join(result.groups()))
for result in re.finditer(pattern, rSeq):
sequences.append("".join(result.groups()))
if len(sequences) > 0:
seqs = pd.DataFrame({'binding':sequences, 'chrom':peak[0], 'chromstart':peak[1], 'chromend':peak[2]})
motifDF.append(seqs)
motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])
search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
Error
This is the error I am getting:
ipython-input-3-2e7ebdf92205> in genSeq(genome_path, chrom,
chromstart, chromend) 25 identifiers = [seq_record.id for seq_record
in genome] ---> 26 seq_gen = next(islice(genome,
identifiers.index(chrom)+1 , None)) 27 seq =
str(seq_gen.seq[chromstart:chromend]) 28 elif bool(re.search('2bit',
genome_path)): StopIteration:
How do I solve this problem?
To the above problem, I was able to solve it by tweaking with my code a little. Here is the solved example for you guys and my problem with the code below:
motif = '[REGULAR_EXPRESSION_FOR_YOUR_MOTIF]'
regBS = re.compile(motif)
motifDF = []
motifQuant = []
genome = tbr.TwoBitFile('/Path_to_your_genomefile_in_2bit.2bit/')
with open('/Path_to_your.bedfile/') as f:
for line in f:
if line.startswith('track') == False:
peak = list(line.split())
seq = (genome[peak[0]][int(peak[1]):int(peak[2])]).upper()
rSeq = reverseComp(seq)
sequences = []
sequences.extend(re.findall(regBS, seq))
sequences.extend(re.findall(regBS, rSeq))
if len(sequences) > 0:
seqs = pd.DataFrame({'binding':sequences, 'chrom':peak[0],'chromstart':peak[1], 'chromend':peak[2], 'NR':'NRF2'})
motifDF.append(seqs)
motifQuant.append([peak[0], peak[1], peak[2], len(seqs), len(seq)])
search_reg = pd.concat(motifDF)
names = ['chrom', 'chromstart', 'chromend', 'numOfMatches', 'lenSeq']
dist_reg = pd.DataFrame(motifQuant, columns=names)
dist_reg.head()
n = 5
x = [len(i[6+n:-6-n]) for i in search_reg['binding']]
This code generates the peak sequences that I want and store it in search_reg[binding] but it also stores a space seperated number with it. I need to store them in two different columns. Any suggestions?

Python for loop repeats

def get_user_data(self,start_url):
html = requests.get(url=start_url,headers=self.headers,cookies=self.cookies).content
selector = etree.fromstring(html,etree.HTMLParser(encoding='utf-8'))
contents = selector.xpath('//span[#class="ctt"]/text()')
times = selector.xpath('//span[#class="ct"]/text()')
data = {}
for each_text in contents:
data['content'] = each_text.encode().decode('utf-8').replace('\u200b','')
for each_time in times:
month_day, time, device = each_time.split(maxsplit=2)
data['mobile_phone'] = device
data['create_time'] = month_day + time
data['crawl_time'] = datetime.strftime(datetime.now(),'%Y-%m-%d %H:%M:%S')
self.mysql.insert(data)
I want to insert data into the database,But the data['content'] field is repeated,How should I modify it?
You should iterate through contents and times in parallel, instead of one after another. Try using zip.
def get_user_data(self,start_url):
html = requests.get(url=start_url,headers=self.headers,cookies=self.cookies).content
selector = etree.fromstring(html,etree.HTMLParser(encoding='utf-8'))
contents = selector.xpath('//span[#class="ctt"]/text()')
times = selector.xpath('//span[#class="ct"]/text()')
for each_text, each_time in zip(contents, times):
data = {}
data['content'] = each_text.encode().decode('utf-8').replace('\u200b','')
month_day, time, device = each_time.split(maxsplit=2)
data['mobile_phone'] = device
data['create_time'] = month_day + time
data['crawl_time'] = datetime.strftime(datetime.now(),'%Y-%m-%d %H:%M:%S')
self.mysql.insert(data)

Python - Loop inside loop of two files with regex

Im trying to create a loop in order to check two files and compare with regex if a specific field matches.
avi file
TVShowName.S01E01.W.DVDRip.XviD.avi
TVShowName.S01E02.W.DVDRip.XviD.avi
TVShowName.S01E03.W.DVDRip.XviD.avi
srt
tvShowName.S01E01.episodename.DVDRip.XviD.srt
tvShowName.S01E02.episodename.DVDRip.XviD.srt
tvShowName.S01E03.episodename.DVDRip.XviD.srt
Without a loop I can match the file and make the magic happen. Although when I use the loop it only reaches the first line.
TVShowName.S01E01.W.DVDRip.XviD.avi
TVShowName.S01E01.W.DVDRip.XviD.srt
Code:
f1 = open('avi', 'r')
f2 = open('srt', 'r')
f3 = open ('merge', 'a')
for avi in f1:
m = re.search(".*([Ss][0-20].[eE][0-24].)+.*", avi )
for sub in f2:
n = re.search(".*([Ss][0-20].[eE][0-24].)+.*", sub )
if m.group(1) == n.group(1):
str_avi = str(m.group(0))
#print str_avi
ext_srt = str_srt.split('.')
ext_avi = str_avi.split('.')
#print ext_avi
#conv_str = str(m.group(0))
merge = str_avi.replace(ext_avi[-1],ext_srt[-1])
print merge
f3.write(merge)
f3.close()
I'm not entirely sure if this is the output you wanted. I can't add comments because I don't have enough reputation points.
import glob
import re
avifiles = []
srtfiles = []
for afile in glob.glob('*.avi'):
avifiles.append(afile)
for sfile in glob.glob('*.srt'):
srtfiles.append(sfile)
#f1 = open('avi', 'r')
#f2 = open('srt', 'r')
f3 = open ('merge', 'a')
for avi in avifiles:
m = re.search(".*([Ss][0-20].[eE][0-24].)+.*", avi )
for sub in srtfiles:
n = re.search(".*([Ss][0-20].[eE][0-24].)+.*", sub )
if m.group(1) == n.group(1):
str_avi = str(m.group(0))
str_srt = str(n.group(0))
ext_srt = str_srt.split('.')
ext_avi = str_avi.split('.')
#print ext_avi
#conv_str = str(m.group(0))
merge = str_avi.replace(ext_avi[-1],ext_srt[-1])
print merge
f3.write(merge+"\n")
f3.close()
I made the follow code and it seems working. My next step is add more video extentions. But it should be easy.
Thank you guys fot the helping!
import re, os, sys, itertools
str_avi = ''
split_avi = ''
global zzz
lista_avi = []
lista_srt = []
lista_final = []
os.chdir('.')
f1 = os.listdir(".")
for full in f1:
avi = re.search(".*([Ss][0-9].[eE][0-9].)+.*(...$)", full )
if avi:
if avi.group(2) == 'avi':
lista_avi.append(avi.group(0))
elif avi.group(2) == 'srt':
lista_srt.append(avi.group(0))
else:
pass
else:
print "Nenhum Arquivo localizado!"
for f,b in itertools.izip(lista_avi,lista_srt):
data_avi = f.split('.')
data_srt = b.split('.')
data_regx_avi = re.search(".*([Ss][0-9].[eE][0-9].)+.*(...$)", f )
data_regx_srt = re.search(".*([Ss][0-9].[eE][0-9].)+.*(...$)", b )
for x in lista_srt:
data_regx_srt = re.search(".*([Ss][0-9].[eE][0-9].)+.*(...$)", x )
if data_regx_avi.group(1) == data_regx_srt.group(1):
print 'Arquivo video:', data_regx_avi.group(0)
print 'Arquivo sub: ', f.replace(data_avi[-1],data_srt[-1])
#lista_final.append(f.replace(data_avi[-1],data_srt[-1]))
xx = f.replace(data_avi[-1],data_srt[-1])
os.rename(x, xx)

Geneate a random integer or to predefined int

I am working with some data that for a specific column can only formatted in 1 of three ways:
3884932039484 (this is randomly generated from my program)
0 (this is static and will never change)
-1 (this is static and will never change)
I want the program to randomly pick between option 1,2 or 3 and insert one of three options. This is what I currently have:
file = open(r'I:\PythonDataFiles\StandardFeedInput\standardfeed_test.tsv', 'r')
all_lines = file.readlines()
#date_time_answer = input('Please input a date and time(2015-09-15 00:00:00): ')
#if date_time_answer == '':
date_time_answer = '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now() - datetime.timedelta(days = 1))
date_time = 1
is_imp = 16
person_id = 19
aid = 44
line_id = 49
cid = 50
is_click = 60
app_id = 0
prev_app_id = ''
new_file = open(r'I:\PythonDataFiles\Standard Feed Output\test2.txt', 'w')
for line in all_lines:
row = line.split('\t')
row[date_time] = date_time_answer
row[person_id] = str((random.randint(1000000000, 9999999999)), 0, -1)
if row[app_id] == str(prev_app_id):
row[is_imp] = str(0)
row[is_click] = str(1)
else:
row[is_imp] = str(1)
prev_app_id = app_id
print(row)
new_file.write('\t'.join(row))
Use random.choice() to pick one of the three options:
random.choice([random.randint(1000000000, 9999999999), 0, -1])

Categories