Delete lines from a txt file after reading - Python

I'm trying to create a script which makes requests to random URLs from a txt file:
import urllib2

with open('urls.txt') as urls:
    for url in urls:
        try:
            r = urllib2.urlopen(url)
        except urllib2.URLError as e:
            r = e
        if r.code in (200, 401):
            print '[{}]: '.format(url), "Up!"
        elif r.code == 404:
            print '[{}]: '.format(url), "Not Found!"
But when a URL returns 404 Not Found, I want it erased from the file. Each URL is on its own line, so basically I want to erase every URL that returns 404. How can I do that?

You could write to a second file:
import urllib2

with open('urls.txt', 'r') as urls, open('urls2.txt', 'w') as urls2:
    for url in urls:
        url = url.strip()  # drop the trailing newline read from the file
        try:
            r = urllib2.urlopen(url)
        except urllib2.URLError as e:
            r = e
        if r.code in (200, 401):
            print '[{}]: '.format(url), "Up!"
            urls2.write(url + '\n')
        elif r.code == 404:
            print '[{}]: '.format(url), "Not Found!"

In order to delete lines from a file, you have to rewrite the entire contents of the file. The safest way to do that is to write out a new file in the same directory and then rename it over the old file. I'd modify your code like this:
import os
import sys
import tempfile
import urllib2

good_urls = set()
with open('urls.txt') as urls:
    for url in urls:
        url = url.strip()  # drop the trailing newline read from the file
        try:
            r = urllib2.urlopen(url)
        except urllib2.URLError as e:
            r = e
        if r.code in (200, 401):
            sys.stdout.write('[{}]: Up!\n'.format(url))
            good_urls.add(url)
        elif r.code == 404:
            sys.stdout.write('[{}]: Not found!\n'.format(url))
        else:
            sys.stdout.write('[{}]: Unexpected response code {}\n'.format(url, r.code))

tmp = None
try:
    tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', dir='.', delete=False)
    for url in sorted(good_urls):
        tmp.write(url + "\n")
    tmp.close()
    os.rename(tmp.name, 'urls.txt')
    tmp = None
finally:
    if tmp is not None:
        os.unlink(tmp.name)
You may want to add a good_urls.add(url) to the else clause in the first loop. If anyone knows a tidier way to do what I did with try-finally there at the end, I'd like to hear about it.
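One tidier option for that final block is to let a bare try/except own the cleanup, so the tmp = None bookkeeping disappears (a sketch using the same names as above):

tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', dir='.', delete=False)
try:
    for url in sorted(good_urls):
        tmp.write(url + "\n")
    tmp.close()
    os.rename(tmp.name, 'urls.txt')
except:
    tmp.close()          # closing an already-closed file is harmless
    os.unlink(tmp.name)  # only remove the temp file if something went wrong
    raise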

Related

How to get simple threading to work in Python

I want to know how I can add simple threading to my code. At the moment it checks the sites one by one, and if a site isn't reachable it waits for the timeout before continuing with the next one, which slows everything down.
import requests
import sys
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

with open("websites.txt", 'r') as websites:
    websites = websites.read().splitlines()
with open("para1.txt", 'r') as para1:
    para1 = para1.read().splitlines()
with open("para2.txt", 'r') as para2:
    para2 = para2.read().splitlines()

def main():
    for i in para1:
        for j in para2:
            for m in websites:
                try:
                    res = requests.get(m + i + j, verify=False, timeout=10)
                    print(m + i + j)
                    if res.status_code == 200:
                        print('Yes')
                    else:
                        print('No')
                except Exception as e:
                    print(e)
                except KeyboardInterrupt:
                    sys.exit()
                finally:
                    res.close()
                    time.sleep(1)

if __name__ == '__main__':
    main()
You can apply ThreadPoolExecutor by moving the part of the code that performs the request into a separate function and passing it as an argument:
import urllib3
import requests
from concurrent.futures import ThreadPoolExecutor, CancelledError, as_completed

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def check_func(url):
    response = requests.get(url, verify=False, timeout=10)
    return response.status_code == 200

def main():
    with open("websites.txt") as website_f, open("para1.txt") as para1_f, \
            open("para2.txt", 'r') as para2_f, ThreadPoolExecutor(max_workers=4) as executor:
        # read the parameter files into lists so they can be iterated repeatedly
        para1_list = para1_f.read().splitlines()
        para2_list = para2_f.read().splitlines()
        tasks = {}
        for website in website_f:
            for para1 in para1_list:
                for para2 in para2_list:
                    url = website.rstrip() + para1.rstrip() + para2.rstrip()
                    tasks[executor.submit(check_func, url)] = url
        for task in as_completed(tasks):
            url = tasks[task]
            try:
                result = task.result()
            except KeyboardInterrupt:  # handling Ctrl + C
                for t in tasks:
                    t.cancel()  # won't cancel already finished or running futures
            except CancelledError:  # will never happen (normally)
                pass
            except Exception as e:
                print(url, "-", "ERROR", e)
            else:
                print(url, "-", "GOOD" if result else "BAD")

if __name__ == "__main__":
    main()
P.S. I haven't tested the entire code, so if there are any problems with it, write in the comments.
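If you don't need the per-future error handling, executor.map is a slightly simpler pattern (a sketch, assuming the same check_func and imports as above and a pre-built list of URLs):

def check_all(urls, workers=4):
    # results come back in the same order as the input URLs
    with ThreadPoolExecutor(max_workers=workers) as executor:
        for url, ok in zip(urls, executor.map(check_func, urls)):
            print(url, "-", "GOOD" if ok else "BAD")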

Python Pixabay API

I wrote a script to download 20,000 images from Pixabay using the Pixabay API. But the problem arises after scraping the 600th image, when it shows the following error:
"File "c:/Users/Dell/Desktop/python-pixabay-master/python-pixabay-master/main.py", line 26, in
pretty="true"
File "c:\Users\Dell\Desktop\python-pixabay-master\python-pixabay-master\pixabay.py", line 144, in search
raise ValueError(resp.text)
ValueError: [ERROR 400] "page" is out of valid range."
Code:
from pixabay import Image, Video
import pprint
import requests
import shutil

API_KEY = 'myAPIkeys'
image = Image(API_KEY)

j = 1
for n in range(1, 100):
    ims = image.search(
        q="education",
        lang="en",
        image_type="all",
        orientation="all",
        category="education",
        min_width=0,
        min_height=0,
        colors="",
        editors_choice="false",
        safesearch="false",
        order="popular",
        page=n,
        per_page=200,
        callback="",
        pretty="true"
    )
    #hits=ims['total']
    #print(hits)
    #print(ims)
    #pp=pprint.PrettyPrinter(indent=4)
    for i in range(0, 200):
        payload = ims['hits'][i]['largeImageURL']
        resp = requests.get(payload, stream=True)
        local_file = open(str(j) + "local_image.jpg", 'wb')
        resp.raw.decode_content = True
        shutil.copyfileobj(resp.raw, local_file)
        del resp
        print(str(j) + "URL of image: {}".format(payload))
        j = j + 1
        #urllib.request.urlretrieve(payload,i)
    #pp.pprint(ims)
@Pakshadow Hi, I have tried the API in Postman. You can see that the API has a page parameter, and in your case all the images are covered by page 3, so on page 4 Pixabay gives the "page is out of valid range" error.
You can try it with this Postman link: https://www.getpostman.com/collections/2823a8aad5ea81b55342
Import it and check that.
You can handle this error using an exception:
from pixabay import Image
import requests
import shutil

API_KEY = 'API_KEY'
image = Image(API_KEY)

j = 1
for n in range(1, 100):
    try:
        ims = image.search(
            q="education",
            lang="en",
            image_type="all",
            orientation="all",
            category="education",
            min_width=0,
            min_height=0,
            colors="",
            editors_choice="false",
            safesearch="false",
            order="popular",
            page=n,
            per_page=200,
            callback="",
            pretty="true"
        )
        for i in range(0, 200):
            payload = ims['hits'][i]['largeImageURL']
            resp = requests.get(payload, stream=True)
            local_file = open(str(j) + "local_image.jpg", 'wb')
            resp.raw.decode_content = True
            shutil.copyfileobj(resp.raw, local_file)
            del resp
            print(str(j) + "URL of image: {}".format(payload))
            j = j + 1
            # urllib.request.urlretrieve(payload,i)
            # pp.pprint(ims)
    except Exception as e:
        print(e)
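Since every page after the last valid one is rejected the same way, it may be cleaner to stop paging as soon as that error appears instead of printing it for all the remaining iterations (a sketch reusing the loop above; the extra search parameters are omitted for brevity):

for n in range(1, 100):
    try:
        ims = image.search(q="education", category="education", page=n, per_page=200)
    except ValueError as e:
        # pixabay.py raises ValueError for the [ERROR 400] "page is out of valid range" response
        print("No more pages:", e)
        break
    for hit in ims['hits']:
        pass  # download hit['largeImageURL'] exactly as in the loop above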

Change a while True Python script to run only once

I'm new to Python, and I want this code to run only once and stop, not repeat every 30 seconds,
because I want to run multiple scripts like this with different access tokens every 5 seconds from the command line.
When I tried this code, it never moves on to the second one because it's a while True loop:
import requests
import time

api_url = "https://graph.facebook.com/v2.9/"
access_token = "access token"
graph_url = "site url"
post_data = {'id': graph_url, 'scrape': True, 'access_token': access_token}

# Beware of rate limiting if trying to increase frequency.
refresh_rate = 30  # refresh rate in seconds

while True:
    try:
        resp = requests.post(api_url, data=post_data)
        if resp.status_code == 200:
            contents = resp.json()
            print(contents['title'])
        else:
            error = "Warning: Status Code {}\n{}\n".format(
                resp.status_code, resp.content)
            print(error)
            raise RuntimeWarning(error)
    except Exception as e:
        f = open("open_graph_refresher.log", "a")
        f.write("{} : {}".format(type(e), e))
        f.close()
        print(e)
    time.sleep(refresh_rate)
From what I understood, you're trying to execute the piece of code for multiple access tokens. To make your job simple, keep all your access tokens and URLs in lists and use the following code. It assumes that you know all your access tokens in advance.
import requests
import time

def scrape_facebook(api_url, access_token, graph_url):
    """Scrapes the given access token."""
    post_data = {'id': graph_url, 'scrape': True, 'access_token': access_token}
    try:
        resp = requests.post(api_url, data=post_data)
        if resp.status_code == 200:
            contents = resp.json()
            print(contents['title'])
        else:
            error = "Warning: Status Code {}\n{}\n".format(
                resp.status_code, resp.content)
            print(error)
            raise RuntimeWarning(error)
    except Exception as e:
        f = open(access_token + "_" + "open_graph_refresher.log", "a")
        f.write("{} : {}".format(type(e), e))
        f.close()
        print(e)

access_token = ['a', 'b', 'c']
graph_url = ['sss', 'xxx', 'ppp']
api_url = "https://graph.facebook.com/v2.9/"

for n in range(len(graph_url)):
    scrape_facebook(api_url, access_token[n], graph_url[n])
    time.sleep(5)
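If you'd rather keep one token per run and drive it from the command line, as the question describes, a minimal sketch is the following (the script name and argument order here are my own choice):

import sys

if __name__ == '__main__':
    # usage: python scrape_once.py <access_token> <graph_url>
    token, url = sys.argv[1], sys.argv[2]
    scrape_facebook("https://graph.facebook.com/v2.9/", token, url)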

Python appending to binary file works in Unix, not in Windows

The function below receives file chunks from web requests and assembles them. It works perfectly on Unix (OS X), but on Windows it doesn't. Specifically, the file does assemble, but it always ends up too small, just a few KB. I cannot figure out what is causing this. No exceptions are raised; it all appears to work, except that the final file is not all there. I've included the entire function for context, but I've marked the section which appears not to be working correctly. (Python 2.7 and Windows Server 2008 R2)
#view_config(route_name='upload', renderer='json')
def upload(request):
    r = request.response
    final_dir = 'w:\\foobar'
    filename = request.params.get('flowFilename')
    chunk_number = request.params.get('flowChunkNumber')
    total_chunks = request.params.get('flowTotalChunks')
    try:
        temp_dir = os.path.join(final_dir, request.params.get('flowIdentifier'))
        file_part = os.path.join(temp_dir, '%s.part.%s' % (filename, chunk_number))
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
    except TypeError:
        pass
    if request.method == 'GET':
        if file_part:
            if os.path.isfile(file_part):
                r.status = 200
            else:
                r.status = 404
        return r
    if request.POST:
        try:
            fo = request.params.get('file').file
            f = open(file_part, 'wb')
            f.write(fo.read())
            f.close()
            if chunk_number == total_chunks:
                final_filename = os.path.join(final_dir, filename)
                temp_filename = filename + '_INCOMPLETE'
                #####################################################################
                # This is where it appears to be going wrong...
                final_file = open(temp_filename, 'a+b')
                try:
                    for i in range(1, int(total_chunks) + 1):
                        ff = open(os.path.join(temp_dir, '%s.part.%s' % (filename, i)))
                        final_file.write(ff.read())
                        ff.close()
                    final_file.close()
                    os.rename(temp_filename, final_filename)  # rename to final filename
                    shutil.rmtree(temp_dir)  # clean up temp part files
                except:
                    raise
                ####################################################################
            r.status = 200
        except Exception, e:
            print 'ERROR', e.message
            r.status = 404
    return r
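A likely culprit here (my own reading, not something stated in the thread) is that the part files are reopened in text mode: on Windows, Python 2's default 'r' mode translates line endings and treats a Ctrl-Z byte as end-of-file, which silently truncates binary data. A sketch of the marked section with explicit binary reads:

final_file = open(temp_filename, 'ab')
try:
    for i in range(1, int(total_chunks) + 1):
        part_path = os.path.join(temp_dir, '%s.part.%s' % (filename, i))
        ff = open(part_path, 'rb')  # binary mode: no newline translation, no Ctrl-Z EOF
        final_file.write(ff.read())
        ff.close()
finally:
    final_file.close()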

Download map image with Python

I am trying to download a map image in Python with the urllib module, but it always fails.
I tried urllib.urlopen() with some parameter variants, and also urllib.urlretrieve(), but it doesn't work. And when I look at the source behind the image URL, I don't find an image file. Here is the image: https://maps.googleapis.com/maps/api/staticmap?center=31.0456,121.3997&zoom=12&size=320x385&sensor=false
Source code:
#-------------------------- PARSE IP ADDRESS -------------------------------
import re
import urllib

try:
    mysite = urllib.urlopen('http://ip-api.com/line')
except urllib.HTTPError, e:
    print "Cannot retrieve URL: HTTP Error Code", e.code
except urllib.URLError, e:
    print "Cannot retrieve URL: " + e.reason[1]

list_of_params = mysite.read()
print list_of_params
ip_arr = list_of_params.splitlines()

#--------------------- HERE IS FIND MAP IMAGE --------------------------------------
try:
    map_page = urllib.urlopen('http://ip-api.com')
except urllib.HTTPError, e:
    print "Cannot retrieve URL: HTTP Error Code", e.code
except urllib.URLError, e:
    print "Cannot retrieve URL: " + e.reason[1]

#f = open("data.html", "w")
#f.write(str(mysite.read()))
#f.close()

# looking for this in page
pattern = re.findall(re.compile("url\(\'(https://maps\.googleapis\.com/maps/api/staticmap\?center=.*)\'"), page_get_map.read())
map_img_url = pattern[0].replace('&amp;', '&')

#------------------- DOWNLOAD MAP IMAGE And SAVE IT ------------------------
#file_name = map_img_url.rsplit('/',1)[1]
try:
    get_map_img = urllib.urlretrieve(map_img_url, "staticmap.png")
except urllib.HTTPError, e:
    print "Cannot retrieve URL: HTTP Error Code", e.code
except urllib.URLError, e:
    print "Cannot retrieve URL: " + e.reason[1]

i = open("pict.png", "w")
i.write(get_map_img.read())
i.close()
print "End of file"
You can fetch the image directly with requests:

import requests

f = open('static.png', 'wb')
f.write(requests.get('https://maps.googleapis.com/maps/api/staticmap?center=31.0456,121.3997&zoom=12&size=320x385&sensor=false').content)
f.close()
Why are you parsing the map URL? Construct it yourself:
import json, urllib

query = ''  # IP to get coordinates of, leave empty for current IP
geo = urllib.urlopen('http://ip-api.com/json/%s?fields=240' % query)
result = json.load(geo)
if result['zip']:
    zoom = 13
elif result['city']:
    zoom = 12
else:
    zoom = 6
map_img_url = "https://maps.googleapis.com/maps/api/staticmap?center=%s,%s&zoom=%i&size=320x385&sensor=false" % (result['lat'], result['lon'], zoom)
get_map_img = urllib.urlretrieve(map_img_url, "staticmap.png")
