Appending a wordlist to URLs to make a web request - python

- I have a wordlist where each entry is printed on a separate line in a .txt file.
- I am adding the wordlist's entries onto the end of a URL (nsip is listed below as a placeholder).
I am trying to take each URL and make web requests, BUT when I print e.g. full_url[0] it just gives me the whole wordlist appended to the URL. When I use type it tells me that full_url is a list, so I am unsure why each element is not accessible.
Any ideas how to make it so I can easily make requests?
lines = [
    '.bash_history',
    '.bashrc',
    '.cache',
    '.config',
    '.cvs',
    '.cvsignore',
    '.forward',
    '.git/HEAD',
    '.history',
    '.hta',
]
for line in lines:
    full_url = []
    full_url.append('https://google.com/' + line)
print(full_url[0])
print(type(full_url))

Move full_url = [] before the loop:
filename = '/Users/Desktop/common.txt'
nsip = 'google.com'
with open(filename, 'r') as file:
    full_url = []
    for line in file:
        linefinal = line.rstrip()
        full_url.append("https://" + nsip + '/' + linefinal)
print(full_url)
which gives:
['https://google.com/.bash_history', 'https://google.com/.bashrc', 'https://google.com/.cache', 'https://google.com/.config', 'https://google.com/.cvs', 'https://google.com/.cvsignore', 'https://google.com/.forward', 'https://google.com/.git/HEAD', 'https://google.com/.history', 'https://google.com/.hta']
Is that what you were after?
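To actually make the requests, loop over the list. A minimal sketch using the requests library; treating anything that is not a 404 as interesting is an assumption, adjust to taste:
import requests

for url in full_url:
    try:
        response = requests.get(url, timeout=5)
    except requests.RequestException as exc:
        print(url, "failed:", exc)
        continue
    # assumed goal: report paths that exist (anything but a plain 404)
    if response.status_code != 404:
        print(response.status_code, url)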

Related

While web-scraping, looping through each URL to make a separate text file for every URL's data raises an error

for i in urls:
    text = fetch_text(i)
    listToStr = ' '.join([str(elem) for elem in text])
    result = re.sub(r'<.*?>', '', listToStr)
    basename = "file_"
    file_name = ["{}_{}.txt".format(basename, j) for j in range(37, 151)]
    with open(file_name[i], 'w') as f:  # ----> Error
        f.write(result)
I wrote the above code to fetch data from each URL and want to create a separate file for every URL's data, but I am getting an error on the open line for file_name: "list indices must be integers or slices, not str".
Can someone help me through it?
You need an integer index to select an entry from file_name; in your loop, i is a URL string, which is why file_name[i] fails.
I changed the iteration variable of urls to url instead of i and used enumerate to get the index, since i should be used for indexes and here you were iterating directly over the elements.
Also note that you have one result for each URL, but your code rebuilds the full list of file names inside every iteration and would store the same result in several files.
This looks wrong, so you should really check the logic of your loops:
for i, url in enumerate(urls):
    text = fetch_text(url)
    listToStr = ' '.join([str(elem) for elem in text])
    result = re.sub(r'<.*?>', '', listToStr)
    basename = "file_"
    file_name = f"{basename}_{i+37}.txt"
    with open(file_name, 'w') as f:
        f.write(result)

How to add strings in the file at the end of all lines

I am trying to download files using Python and then add lines at the end of the downloaded files, but it returns an error:
f.write(data + """<auth-user-pass>
TypeError: can't concat str to bytes
Edit: Thanks, it works now when I do this: b"""< auth-user-pass >""", but I only want to add the string at the end of the file. When I run the code, it adds the string after every line.
I also tried something like this, but it also did not work: f.write(str(data) + "< auth-user-pass >")
Here is my full code:
import os
import requests
from multiprocessing.pool import ThreadPool

def download_url(url):
    print("downloading: ", url)
    # assumes that the last segment after the / represents the file name
    # if url is abc/xyz/file.txt, the file name will be file.txt
    file_name_start_pos = url.rfind("/") + 1
    file_name = url[file_name_start_pos:]
    save_path = 'ovpns/'
    complete_path = os.path.join(save_path, file_name)
    print(complete_path)
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(complete_path, 'wb') as f:
            for data in r:
                f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""")
    return url

servers = [
    "us-ca72.nordvpn.com",
    "us-ca73.nordvpn.com"
]
urls = []
for server in servers:
    urls.append("https://downloads.nordcdn.com/configs/files/ovpn_legacy/servers/" + server + ".udp1194.ovpn")
# Run 5 threads. Each call will take the next element in the urls list.
results = ThreadPool(5).imap_unordered(download_url, urls)
for r in results:
    print(r)
Try this:
import os
import requests
from multiprocessing.pool import ThreadPool

def download_url(url):
    print("downloading: ", url)
    # assumes that the last segment after the / represents the file name
    # if url is abc/xyz/file.txt, the file name will be file.txt
    file_name_start_pos = url.rfind("/") + 1
    file_name = url[file_name_start_pos:]
    save_path = 'ovpns/'
    complete_path = os.path.join(save_path, file_name)
    print(complete_path)
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(complete_path, 'wb') as f:
            for data in r:
                f.write(data)
        # append the credentials block once, after the whole file is downloaded
        with open(complete_path, 'ab') as f:
            f.write(b"""<auth-user-pass>
username
password
</auth-user-pass>""")
    return url

servers = [
    "us-ca72.nordvpn.com",
    "us-ca73.nordvpn.com"
]
urls = []
for server in servers:
    urls.append("https://downloads.nordcdn.com/configs/files/ovpn_legacy/servers/" + server + ".udp1194.ovpn")
# Run 5 threads. Each call will take the next element in the urls list.
results = ThreadPool(5).imap_unordered(download_url, urls)
for r in results:
    print(r)
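The append happens inside download_url, after the download loop finishes, so the block is written exactly once per file. Opening with 'ab' keeps the downloaded bytes in place and, like 'wb', accepts only bytes, hence the b"""...""" literal.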
You are using binary mode; encode your string before concatenating. That is, replace
for data in r:
    f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""")
with
for data in r:
    f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""".encode())
You opened the file for writing in binary mode.
Because of that you can't write normal strings, as the comment from @user56700 said.
You either need to encode the string or open the file in text mode.
Also note that opening with 'wb' truncates any existing data, as write mode normally does; if you want to keep what is already in the file and add to the end, open it in append mode ('ab') instead.
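If appending is the goal, a minimal sketch in text append mode, assuming complete_path points at a file that has already been fully downloaded:
# hedged sketch: add the auth block once, at the end of an existing file
with open(complete_path, 'a', encoding='utf-8') as f:
    f.write("\n<auth-user-pass>\nusername\npassword\n</auth-user-pass>\n")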

Eliminate specific number in a data file using Python

I have a large file and I want to delete all the values '24' within the data file. I have used this code but it doesn't do what I want. Suggestions please. Thanks
This is the data file
24,24,24,24,24,24,1000,1000,24,24,24,1000,1000,1000,1000,24,24,24,24,24,24,24,24,24,24,1000,1000,1000,1000,1000,1000,1000,1000,24,24,24,24,1000,1000,1000,1000,24,1000,24,24,24,24,1000,1000,1000,1000,1000,24,24,24,24,24,24,1000,24,24,24,24,1000,1000,1000,1000,1000,1000,1000,1000,1000,24,24,24,24,1000,1000,1000,1000,24,1000,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,1000,1000,24,24,24,24,24,24,1000,1000,1000,24,24,24,24,1000,1000,1000,1000,1000,1000,1000,1000,1000,24,24,24,24,24,24,24,24,24,24,24,24,24,1000,1000,24,24,24,24,24,24,24,24,24,1000,1000,1000,24,24,24,1000,24,24,1000,1000,24,24,24,24,1000,1000,1000,1000,1000,1000,1000,24,24,24,1000,1000,1000,1000,1000,1000,24,24,24,1000,1000,1000,1000,1000,1000,1000,24,24,24,24,1000,1000,24,1000,1000,24,24,1000,1000,1000,1000,1000,1000,1000,24,24,24,1000,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,1000,1000,24,24,24,1000,1000,1000,1000,1000,24,24,24,24,24,24,24,24,1000,1000,1000,1000,1000,24,24,24,24,24,24,1000,24,24,24,24,24,24,24,24,24,1000,1000,1000,1000,1000,1000,24,24,24,24,24,24,24,24,24,24,1000,1000,1000,24,1000,1000,1000,1000,24,24,1000,1000,24,24,24,24,24,24,24,1000,24,24,24,24,24,24,1000,1000,1000,1000,1000,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,1000,1000,1000,1000,1000
Code
content = open('txt1.txt', 'r').readlines()
cleandata = []
for line in content:
    line = {i: None for i in line.replace("\n", "").split()}
    for value in line.copy():
        if value == "24":
            line.pop(value)
    cleandata.append(" ".join(line) + "\n")
open('txt2.txt', 'w').writelines(cleandata)
This should do it: split each line on the commas and keep only the values that are not '24', so no stray separators are left behind:
content = open('txt1.txt', 'r').readlines()
cleandata = []
for line in content:
    values = [v for v in line.strip().split(',') if v != '24']
    cleandata.append(','.join(values) + '\n')
open('txt2.txt', 'w').writelines(cleandata)
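For example, the line 24,24,1000,24,1000 comes out as 1000,1000: each value is compared whole, so a number that merely contains 24 (such as 240) would be kept.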
You could use a regex for it, to match the 24 and delete it.
import re

regex24 = re.compile(r"\b24,?\b")
f = open('txt1.txt', 'r')
cleans = [regex24.sub("", line) for line in f.readlines()]
open('txt2.txt', 'w').writelines(cleans)
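One edge case, assuming a line can end with a 24: the comma before it survives the substitution (1000,24 becomes 1000,). If that matters, a second pass can tidy up the line endings:
cleans = [re.sub(r",\s*$", "\n", c) for c in cleans]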

Download multiple files in Python loop, only first works

I am trying to download a few videos from a list of links.
Every line in the text file is one link.
When I try to download all the videos in a loop, only the first one works.
The videos are 60-100 MB each.
The loop continues afterwards, but the files are empty.
Thank you for your help.
def download():
    name = 'video'
    a = 1
    with open('download.txt') as f:
        lines = f.readlines()
        for line in lines:
            url = line
            response = requests.get(url, stream=True)
            name = name + str(a)
            filename = name + '.mp4'
            with open(filename, 'wb') as f:
                f.write(response.content)
            a = a + 1
import urllib.request

def download():
    name = 'video'
    a = 1
    with open('download.txt') as f:
        lines = f.readlines()
        for line in lines:
            urllib.request.urlretrieve(line.strip(), name + str(a) + ".mpg")
            a += 1
This code worked for me. The likely problems in your version are the trailing newline left on each URL (line.strip() removes it) and name = name + str(a), which keeps growing the base name on every iteration (video1, video12, video123, ...).
Depending on your purpose, you may also want to account for security, robustness (what happens if one download fails?), and performance (concurrency?).
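For example, a hedged sketch of a more defensive variant using requests; the timeout, chunk size, and error handling are illustrative choices, not requirements:
import requests

def download():
    with open('download.txt') as f:
        lines = f.readlines()
    for a, line in enumerate(lines, start=1):
        url = line.strip()
        if not url:
            continue  # skip blank lines
        filename = 'video' + str(a) + '.mp4'
        try:
            response = requests.get(url, stream=True, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(url, "failed:", exc)
            continue
        with open(filename, 'wb') as out:
            # stream in chunks so a 60-100 MB video is never held in memory at once
            for chunk in response.iter_content(chunk_size=1 << 20):
                out.write(chunk)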

Python -- How to split headers/chapters into separate files automatically

I'm converting text directly to epub and I'm having a problem automatically splitting the HTML book file into separate header/chapter files. At the moment, the code below partially works but only creates every other chapter file. So half the header/chapter files are missing from the output. Here is the code:
def splitHeaderstoFiles(fpath):
    infp = open(fpath, 'rt', encoding=('utf-8'))
    for line in infp:
        # format and split headers to files
        if '<h1' in line:
            #-----------format header file names and other stuff ------------#
            # create a new file for the header/chapter section
            path = os.getcwd() + os.sep + header
            with open(path, 'wt', encoding=('utf-8')) as outfp:
                # write html top meta headers
                outfp = addMetaHeaders(outfp)
                # add the header
                outfp.write(line)
                # add the chapter/header bodytext
                for line in infp:
                    if '<h1' not in line:
                        outfp.write(line)
                    else:
                        outfp.write('</body>\n</html>')
                        break
        else:
            continue
    infp.close()
The problem occurs in the second for loop at the bottom of the code, when I look for the next h1 tag to stop the split. I cannot use seek() or tell() to rewind or move back one line so the program can find the next header/chapter on the next iteration. Apparently you cannot use these in Python inside a for loop that is iterating a file; it just gives a 'can't do non-zero cur-relative seeks' error.
I've also tried the while line != ' ' + readline() combination in the code, which gives the same error as above.
Does anyone know an easy way to split HTML headers/chapters of varying lengths into separate files in Python? Are there any special Python modules (such as pickle) that could help make this task easier?
I'm using Python 3.4
My grateful thanks in advance for any solutions to this problem...
I ran into a similar problem a while ago; here is a simplified solution:
from itertools import count

chapter_number = count(1)
output_file = open('000-intro.html', 'wt')
with open('index.html', 'rt') as input_file:
    for line in input_file:
        if '<h1' in line:
            output_file.close()
            output_file = open('{:03}-chapter.html'.format(next(chapter_number)), 'wt')
        output_file.write(line)
output_file.close()
In this approach, the block of text leading up to the first h1 is written into 000-intro.html; the first chapter is written into 001-chapter.html, and so on. Please modify it to taste.
The solution is a simple one: Upon encountering the h1 tag, close the last output file and open a new one.
You are looping over your input file twice, which is likely causing your problems:
for line in infp:
    ...
    with open(path, 'wt', encoding=('utf-8')) as outfp:
        ...
        for line in infp:
            ...
Both loops draw from the same file iterator, so the inner loop consumes the line containing the next <h1> and discards it when it breaks; the outer loop then resumes after that header, which is why every other chapter file goes missing.
You might try transforming your for loops into whiles, so you control exactly when the next line is read and can keep the header line around between iterations (a sketch; the open(...) arguments are elided as before):
line = infp.readline()
while line:
    if '<h1' in line:
        with open(...) as outfp:
            outfp.write(line)
            line = infp.readline()
            while line and '<h1' not in line:
                outfp.write(line)
                line = infp.readline()
        continue  # 'line' already holds the next <h1>, so don't read again
    line = infp.readline()
Alternatively, you may wish to use an HTML parser (i.e., BeautifulSoup). Then you can do something like what is described here: https://stackoverflow.com/a/8735688/65295.
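A rough sketch of that parser-based route, assuming BeautifulSoup is installed and the <h1> tags sit at the same level of the document as their chapter content:
from bs4 import BeautifulSoup

with open('index.html', 'rt', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

for n, h1 in enumerate(soup.find_all('h1'), 1):
    # collect everything from this <h1> up to (but not including) the next one
    parts = [str(h1)]
    for sibling in h1.find_next_siblings():
        if sibling.name == 'h1':
            break
        parts.append(str(sibling))
    with open('{:03}-chapter.html'.format(n), 'wt', encoding='utf-8') as out:
        out.write('\n'.join(parts))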
Update from comment - essentially, read the entire file all at once so you can freely move back or forward as necessary. This probably won't be a performance issue unless you have a really big file (or very little memory).
lines = infp.readlines()  # read the entire file
i = 0
while i < len(lines):
    if '<h1' in lines[i]:
        with open(...) as outfp:
            outfp.write(lines[i])
            j = i + 1
            while j < len(lines):
                if '<h1' in lines[j]:
                    break
                outfp.write(lines[j])
                j += 1
        # line j has an <h1> (or is one past the end); set i to j so we
        # detect it at the top of the next loop iteration
        i = j
    else:
        i += 1
I eventually found the answer to the above problem. The code below does a lot more than just split on the file headers. It also simultaneously loads two parallel lists with formatted file name data (with extension) and pure header name data respectively, so I can use these lists to fill in the <title> tags and formatted filename extensions in these html files within a while loop in one hit. The code now works well and is shown below.
def splitHeaderstoFiles(dir, inpath):
    count = 1
    t_count = 0
    out_path = ''
    header = ''
    write_bodytext = False
    file_path_names = []
    pure_header_names = []
    inpath = dir + os.sep + inpath
    with open(inpath, 'rt', encoding=('utf-8')) as infp:
        for line in infp:
            if '<h1' in line:
                # strip html tags, convert to start caps
                p = re.compile(r'<.*?>')
                header = p.sub('', line)
                header = capwords(header)
                line_save = header
                # Add 0 for count below 10
                if count < 10:
                    header = '0' + str(count) + '_' + header
                else:
                    header = str(count) + '_' + header
                # remove all spaces + add extension in header
                header = header.replace(' ', '_')
                header = header + '.xhtml'
                count = count + 1
                # create two parallel lists used later
                out_path = dir + os.sep + header
                outfp = open(out_path, 'wt', encoding=('utf-8'))
                file_path_names.insert(t_count, out_path)
                pure_header_names.insert(t_count, line_save)
                t_count = t_count + 1
                # Add html meta headers and write it
                outfp = addMainHeaders(outfp)
                outfp.write(line)
                write_bodytext = True
            # add header bodytext
            elif write_bodytext == True:
                outfp.write(line)
    # now add html titles and close the html tails on all files
    max_num_files = len(file_path_names)
    tmp = dir + os.sep + 'temp1.tmp'
    i = 0
    while i < max_num_files:
        outfp = open(tmp, 'wt', encoding=('utf-8'))
        infp = open(file_path_names[i], 'rt', encoding=('utf-8'))
        for line in infp:
            if '<title>' in line:
                line = line.strip(' ')
                line = line.replace('<title></title>', '<title>' + pure_header_names[i] + '</title>')
                outfp.write(line)
            else:
                outfp.write(line)
        # add the html tail if the last line did not already close the document
        if '</body>' in line or '</html>' in line:
            pass
        else:
            outfp.write(' </body>' + '\n</html>')
        # clean up
        infp.close()
        outfp.close()
        shutil.copy2(tmp, file_path_names[i])
        os.remove(tmp)
        i = i + 1
    # now rename just the title page
    if os.path.isfile(file_path_names[0]):
        title_page_name = file_path_names[0]
        new_title_page_name = dir + os.sep + '01_Title.xhtml'
        os.rename(title_page_name, new_title_page_name)
        file_path_names[0] = '01_Title.xhtml'
    else:
        logmsg27(DEBUG_FLAG)
        os._exit(0)
    # xhtml file is no longer needed
    if os.path.isfile(inpath):
        os.remove(inpath)
    # returned list values are also used
    # later to create epub opf and ncx files
    return (file_path_names, pure_header_names)
@Hai Vu and @Seth -- Thanks for all your help.
