Trouble multiprocessing - "The parameter is incorrect" on p.start() - python

I've been having a lot of trouble with multiprocessing; I've been trying for hours and can't get it right. Here's my code, commented as best I could. I've included all of it because I don't know exactly what's causing the error. The traceback points at p.start() (line 74 of my script); the most relevant part of the code is at the bottom of the question.
Here are my imports:
import urllib
import socket
import multiprocessing as mp
import queue
import requests
The header below is used for a higher chance of success when connecting to a website:
headers = {'User-agent': 'Mozilla/5.0'}
The main function takes four parameters: the queue, the URL list, the output file, and the list of vulnerable URLs.
def mainFunction(q, URLList, Output, vulnURLS):
    # This list is used to check if the page source has any of the errors in the list
    # after adding a string query (') to the end of the URL.
    queries = ['SQL syntax', 'mysql_fetch', 'mysql_num_rows', 'mySQL Error', 'mySQL_connect()', 'UNION SELECT', 'MySQL server version']
    # This puts the URL in the correct format before testing for injection points.
    URLReplace = [("['", ""),("']",""), ("\n", ""), ("https://","http://"), ("\s", "%20"), ("\s", "%20")]
    URL = ''.join(str(URLList))
    for URL in URLList:
        if (z < len(URLReplace)):
            URL = URL.replace(URLReplace[z])
            z = z + 1
        URL = (URL + "'")
        # This is the try request, where it attempts to connect and scrapes the HTML off of the webpage.
        try:
            req = requests.get(URL, timeout=2)
            htmlObject = urllib.request.urlopen(URL)
            # This iterates through the list to check for any possible vulnerabilities.
            # Also returns 404/400 messages.
            if (y < len(queries)):
                if queries[x] in htmlObject:
                    print ("\t [+] " + URL)
                    vulnURLS.append(URL)
                    Output.open()
                    for VURLS in vulnURLS:
                        Output.write(VURLS + '\n')
                    Output.close()
                y = y + 1
            else:
                print ("\t [-] " + URL)
        except urllib.error.HTTPError as e:
            if e.code == 404:
                print("\t [-] Page not found.")
            if e.code == 400:
                print ("\t [+] " + URL)
        except urllib.error.URLError as e:
            print("\t [-] URL Timed Out")
        except socket.timeout as e:
            print("\t [-] URL Timed Out")
        except socket.error as e:
            print("\t [-] Error in URL")
Here's the important part, where I use the Queue and the multiprocessing module:
if __name__ == '__main__':
    q = mp.Queue()
    URLList = [i.strip().split() for i in open('sites.txt').readlines()]
    Output = open('output.txt', 'r')
    vulnURLS = []
    p = mp.Process(target=mainFunction, args=(q, URLList, Output, vulnURLS))
    p.start()
    q.put(mainFunction(URLList))
    q.close()
    q.join_thread()
    p.join()
Please help me out with this problem. I've been stuck on it for hours and am getting very frustrated; every example I look at I follow to a T and still get this same error.
I have tried multithreading, but it is extremely slow and unstable compared to multiprocessing.

Change to the following:
p = mp.Process(target=mainFunction, args=(q, Output))
p.start()
for url in URLList:
    q.put(url)
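The idea, as I read it, is that the worker pulls URLs off the queue itself instead of receiving the whole list up front. A minimal sketch of what that might look like (my assumption of the intent; note it also passes the output file's path rather than an open file object, since open file handles generally can't be pickled across to a child process):

import multiprocessing as mp

def mainFunction(q, output_path):
    # Hypothetical reworked worker: open the output file inside the child
    # process and read URLs from the queue until a sentinel value arrives.
    with open(output_path, 'a') as output:
        while True:
            url = q.get()
            if url is None:           # sentinel: no more work
                break
            output.write(url + '\n')  # placeholder for the real scan logic

if __name__ == '__main__':
    URLList = [line.strip() for line in open('sites.txt')]
    q = mp.Queue()
    p = mp.Process(target=mainFunction, args=(q, 'output.txt'))
    p.start()
    for url in URLList:
        q.put(url)
    q.put(None)                       # tell the worker to finish
    p.join()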

Related

Change a while true python script to run only once

I'm new to Python and I want this code to run only once and then stop, not every 30 seconds, because I want to run multiple scripts like this with different access tokens every 5 seconds from the command line. When I tried this code, it never jumps to the second one because of the while True:
import requests
import time

api_url = "https://graph.facebook.com/v2.9/"
access_token = "access token"
graph_url = "site url"
post_data = { 'id':graph_url, 'scrape':True, 'access_token':access_token }

# Beware of rate limiting if trying to increase frequency.
refresh_rate = 30 # refresh rate in second

while True:
    try:
        resp = requests.post(api_url, data = post_data)
        if resp.status_code == 200:
            contents = resp.json()
            print(contents['title'])
        else:
            error = "Warning: Status Code {}\n{}\n".format(
                resp.status_code, resp.content)
            print(error)
            raise RuntimeWarning(error)
    except Exception as e:
        f = open("open_graph_refresher.log", "a")
        f.write("{} : {}".format(type(e), e))
        f.close()
        print(e)
    time.sleep(refresh_rate)
From what I understood, you're trying to execute this piece of code for multiple access tokens. To keep it simple, put all your access tokens (and graph URLs) in lists and use the following code. It assumes that you know all your access tokens in advance.
import requests
import time

def scrape_facebook(api_url, access_token, graph_url):
    """ Scrapes the given access token"""
    post_data = { 'id':graph_url, 'scrape':True, 'access_token':access_token }
    try:
        resp = requests.post(api_url, data = post_data)
        if resp.status_code == 200:
            contents = resp.json()
            print(contents['title'])
        else:
            error = "Warning: Status Code {}\n{}\n".format(
                resp.status_code, resp.content)
            print(error)
            raise RuntimeWarning(error)
    except Exception as e:
        f = open(access_token+"_"+"open_graph_refresher.log", "a")
        f.write("{} : {}".format(type(e), e))
        f.close()
        print(e)

access_token = ['a','b','c']
graph_url = ['sss','xxx','ppp']
api_url = "https://graph.facebook.com/v2.9/"

for n in range(len(graph_url)):
    scrape_facebook(api_url, access_token[n], graph_url[n])
    time.sleep(5)
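A minor stylistic variant of that final loop: since the tokens and URLs are paired, zip() reads a little more cleanly than indexing (same behaviour, assuming the two lists stay the same length):

for token, url in zip(access_token, graph_url):
    scrape_facebook(api_url, token, url)
    time.sleep(5)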

How to do this (check requests status codes) in Python concurrently or in parallel?

Here's what I'm doing:
1. Get words from a text file - every word is on a separate line.
2. Add http://www. and .com to words to create a url.
3. Get the URL with requests.
4. Find out if it's a free domain or not (based on status codes and error in connection/other error).
5. Add free domains to a text file.
6. Time it all.
I've kind of made it work so far, but it's very slow. The text file has 350,000 words. How would I go about doing this concurrently or in parallel? And which would be the better choice for this task?
Here's my code:
import requests, time

start = time.time()

with open('words1.txt','r') as f:
    words = []
    for item in f:
        words.append(item.strip())

for w in words:
    url = 'http://www.'+w+'.com'
    try:
        header = {'User-Agent': 'Mozilla/5.0'}
        r = requests.get(url, headers=header)
        codes = [200,201,202,203,204,205,206,300,301,302,303,307,308,400,401,402,403,404,405,406,500,501,502,503]
        if r.status_code in codes:
            print(url,': Known Status Code > Unavailable')
        else:
            print(url,': Unknown Status Code > Probably Free')
            with open('available.txt','a') as myfile:
                myfile.write(url+'\n')
    except requests.exceptions.ConnectionError:
        print(url,' : Connection Error > Probably Free')
        with open('available.txt','a') as myfile:
            myfile.write(url+'\n')
    except requests.exceptions.HTTPError:
        print('http error')
    except requests.exceptions.Timeout:
        print('timeout error')
    except requests.exceptions.TooManyRedirects:
        print('too many redirects')

end = time.time()
print('\n')
print(end-start, 'seconds')
print((end-start)/60,'minutes')
print(((end-start)/60)/60,'hours')
Thanks!
EDIT: I got it to work. Thanks for the help Kendas and DeepSpace!
Here's a quick test:
100 words - 22 sec
1000 words - 285 sec
Not too fast but way faster than my first try.
Seems like gevent + socket is the way to go.
Please let me know if you have any tips on making this better/faster.
Here's the code:
import gevent,time
from gevent import socket

start = time.time()

words = []
with open('words1000.txt','r') as f:
    for item in f:
        words.append(item.strip())

urls = ['www.{}.com'.format(w) for w in words]
jobs = [gevent.spawn(socket.gethostbyname, url) for url in urls]
gevent.joinall(jobs)
values = {url:job.value for (url,job) in zip(urls,jobs)}

freeDomains = []
for (v,job,url) in zip(values,jobs,urls):
    if job.value == None:
        freeDomains.append(url)
        with open('availableds.txt','a') as myFile:
            myFile.write(url+'\n')

print(freeDomains)

end = time.time()
print(end-start,'seconds')
print((end-start)/60,'minutes')
print((end-start)/3600,'hours')
grequests (the concurrent version of requests) makes this pretty easy.
It will also help to use .format and to avoid redefining header on every iteration:
import grequests

def exception_handler(request, exception):
    print(exception)

with open('words1.txt','r') as f:
    words = []
    for item in f:
        words.append(item.strip())

urls = ['http://www.{}.com'.format(w) for w in words]
header = {'User-Agent': 'Mozilla/5.0'}

requests = [grequests.get(url, headers=header) for url in urls]
responses = grequests.map(requests, exception_handler=exception_handler)

for resp in responses:
    if resp:
        print(resp.status_code)
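If you'd rather stay in the standard library than add gevent or grequests, the same fan-out can be done with a concurrent.futures thread pool. A rough sketch, reusing the word file and header from above (the worker count and timeout value here are my own choices):

import concurrent.futures
import requests

header = {'User-Agent': 'Mozilla/5.0'}

with open('words1.txt', 'r') as f:
    urls = ['http://www.{}.com'.format(line.strip()) for line in f]

def check(url):
    # Return the status code, or None if the connection failed.
    try:
        return requests.get(url, headers=header, timeout=5).status_code
    except requests.exceptions.RequestException:
        return None

with concurrent.futures.ThreadPoolExecutor(max_workers=50) as pool:
    for url, status in zip(urls, pool.map(check, urls)):
        if status is None:
            print(url, ': Connection Error > Probably Free')
        else:
            print(url, ': Status Code', status)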

Python custom 404 response error

I wrote a hiscore checker for a game that I play. Basically, you enter a list of usernames into a .txt file and it outputs the results in found.txt.
However, if the page responds with a 404, it throws an error instead of returning "0" and continuing with the list.
Example of the script:
#!/usr/bin/python
import urllib2

def get_total(username):
    try:
        req = urllib2.Request('http://services.runescape.com/m=hiscore/index_lite.ws?player=' + username)
        res = urllib2.urlopen(req).read()
        parts = res.split(',')
        return parts[1]
    except urllib2.HTTPError, e:
        if e.code == 404:
            return "0"
    except:
        return "err"

filename = "check.txt"
accs = []
handler = open(filename)
for entry in handler.read().split('\n'):
    if "No Displayname" not in entry:
        accs.append(entry)
handler.close()

for account in accs:
    display_name = account.split(':')[len(account.split(':')) - 1]
    total = get_total(display_name)
    if "err" not in total:
        rStr = account + ' - ' + total
        handler = open('tried.txt', 'a')
        handler.write(rStr + '\n')
        handler.close()
        if total != "0" and total != "49":
            handler = open('found.txt', 'a')
            handler.write(rStr + '\n')
            handler.close()
            print rStr
    else:
        print "Error searching"
        accs.append(account)

print "Done"
The HTTPError exception handling that doesn't seem to be working:
except urllib2.HTTPError, e:
    if e.code == 404:
        return "0"
except:
    return "err"
Error response shown below.
Now, I understand the error shown doesn't seem to be related to a 404 response; however, it only occurs with users whose request returns a 404 - any other request works fine. So I can assume the issue is within the 404 exception handling.
I believe the issue may lie in the fact that the 404 is a custom page which you get redirected to? So the original page is "example.com/index.php" but the 404 is "example.com/error.php"?
Not sure how to fix it.
For testing purposes, the format to use is:
ID:USER:DISPLAY
which is placed into check.txt.
It seems that total can end up being None. In that case you can't check whether it has 'err' in it. To fix the crash, try changing that line to:
if total is not None and "err" not in total:
To be more specific, get_total is returning None, which means that either parts[1] is None, or except urllib2.HTTPError, e: is executed but e.code is not 404.
In the latter case None is returned because the exception is caught, but you're only dealing with the very specific 404 case and ignoring every other HTTP error code.
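Following that reasoning, one way to make sure get_total always returns a string is to return "err" for every non-404 HTTP error instead of falling through to None. A small sketch in the question's Python 2 style (this is my illustration of the answer's point, not tested against the live hiscores service):

import urllib2

def get_total(username):
    try:
        req = urllib2.Request('http://services.runescape.com/m=hiscore/index_lite.ws?player=' + username)
        res = urllib2.urlopen(req).read()
        return res.split(',')[1]
    except urllib2.HTTPError, e:
        if e.code == 404:
            return "0"
        return "err"   # any other HTTP error code, instead of implicitly returning None
    except Exception:
        return "err"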

NameError: name 'result' is not defined. Can't figure out how to get the results to the next line

I'm a complete beginner in Python, trying to get a script to work, but I'm a little at a loss as to where it goes wrong. From reading other posts it seems result hasn't been defined before and Python doesn't know how to deal with it.
I'm running Python 2.7.11 on El Capitan.
python ytc.py -v YQHsXMglC9A
[*] Retrieving video ID: YQHsXMglC9A
[*] Thumbnails retrieved. Now submitting to TinEye.
[*] Searching TinEye for: https://i.ytimg.com/vi/YQHsXMglC9A/default.jpg
Traceback (most recent call last):
  File "ytc.py", line 72, in <module>
    if result.total_results:
NameError: name 'result' is not defined
This is the script:
import argparse
import requests
import json
from pytineye import TinEyeAPIRequest

tineye = TinEyeAPIRequest('http://api.tineye.com/rest/','PUBLICKEY','PRIVATEKEY')
youtube_key = "MY-API"

ap = argparse.ArgumentParser()
ap.add_argument("-v","--videoID", required=True,help="The videoID of the YouTube video. For example: https://www.youtube.com/watch?v=VIDEOID")
args = vars(ap.parse_args())
video_id = args['videoID']

#
# Retrieve the video details based on videoID
#
def youtube_video_details(video_id):
    api_url = "https://www.googleapis.com/youtube/v3/videos?part=snippet%2CrecordingDetails&"
    api_url += "id=%s&" % video_id
    api_url += "key=%s" % youtube_key
    response = requests.get(api_url)
    if response.status_code == 200:
        results = json.loads(response.content)
        return results
    return None

print "[*] Retrieving video ID: %s" % video_id
video_data = youtube_video_details(video_id)
thumbnails = video_data['items'][0]['snippet']['thumbnails']
print "[*] Thumbnails retrieved. Now submitting to TinEye."

url_list = []

# add the thumbnails from the API to the list
for thumbnail in thumbnails:
    url_list.append(thumbnails[thumbnail]['url'])

# build the manual URLS
for count in range(4):
    url = "http://img.youtube.com/vi/%s/%d.jpg" % (video_id,count)
    url_list.append(url)

results = []

# now walk over the list of URLs and search TinEye
for url in url_list:
    print "[*] Searching TinEye for: %s" % url
    try:
        result = tineye.search_url(url)
    except:
        pass
    if result.total_results:
        results.extend(result.matches)

result_urls = []
dates = {}

for match in results:
    for link in match.backlinks:
        if link.url not in result_urls:
            result_urls.append(link.url)
            dates[link.crawl_date] = link.url

print
print "[*] Discovered %d unique URLs with image matches." % len(result_urls)

for url in result_urls:
    print url

oldest_date = sorted(dates.keys())
print
print "[*] Oldest match was crawled on %s at %s" % (str(oldest_date[0]),dates[oldest_date[0]])
If the try fails, the except block executes; since it contains only pass and never assigns result, the subsequent if result.total_results references a name that was never defined.
This should be a quick fix:
try:
    result = tineye.search_url(url)
except NameError:
    print 'Nothing Found !'
    break
if result.total_results:
    results.extend(result.matches)
The error is clear: the result variable is used while it's not defined, which happens whenever the call inside your try fails.
Fix it by moving the check into the try block:
for url in url_list:
    print "[*] Searching TinEye for: %s" % url
    try:
        result = tineye.search_url(url)
        if result.total_results:
            results.extend(result.matches)
    except:
        pass
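If you'd rather see which lookups fail instead of silently swallowing every error, a slightly noisier variant of the same loop (my suggestion, not part of either answer) keeps the script running while reporting the failing URL:

for url in url_list:
    print "[*] Searching TinEye for: %s" % url
    try:
        result = tineye.search_url(url)
        if result.total_results:
            results.extend(result.matches)
    except Exception as e:
        print "[!] TinEye lookup failed for %s: %s" % (url, e)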

Download map image with Python

I am trying to download a map image in Python with the urllib module, but it always fails. I tried urllib.urlopen() with some parameter variants and also urllib.urlretrieve(), but it doesn't work. And when I look at the page source, I can't find the image file. Here is the image: https://maps.googleapis.com/maps/api/staticmap?center=31.0456,121.3997&zoom=12&size=320x385&sensor=false
Source code:
#-------------------------- PARSE IP ADDRESS -------------------------------
import re
import urllib

try:
    mysite = urllib.urlopen('http://ip-api.com/line')
except urllib.HTTPError, e:
    print "Cannot retrieve URL: HTTP Error Code", e.code
except urllib.URLError, e:
    print "Cannot retrieve URL: " + e.reason[1]

list_of_params = mysite.read()
print list_of_params
ip_arr = list_of_params.splitlines()

#--------------------- HERE IS FIND MAP IMAGE --------------------------------------
try:
    map_page = urllib.urlopen('http://ip-api.com')
except urllib.HTTPError, e:
    print "Cannot retrieve URL: HTTP Error Code", e.code
except urllib.URLError, e:
    print "Cannot retrieve URL: " + e.reason[1]

#f = open("data.html", "w")
#f.write(str(mysite.read()))
#f.close()

# looking for this in page
pattern = re.findall(re.compile("url\(\'(https://maps\.googleapis\.com/maps/api/staticmap\?center=.*)\'"), page_get_map.read())
map_img_url = pattern[0].replace('&amp;', '&')

#------------------- DOWNLOAD MAP IMAGE And SAVE IT ------------------------
#file_name = map_img_url.rsplit('/',1)[1]
try:
    get_map_img = urllib.urlretrieve(map_img_url, "staticmap.png")
except urllib.HTTPError, e:
    print "Cannot retrieve URL: HTTP Error Code", e.code
except urllib.URLError, e:
    print "Cannot retrieve URL: " + e.reason[1]

i = open("pict.png", "w")
i.write(get_map_img.read())
i.close()
print "End of file"
import requests
f=open('static.png','wb')
f.write(requests.get('https://maps.googleapis.com/maps/api/staticmap?center=31.0456,121.3997&zoom=12&size=320x385&sensor=false').content)
f.close()
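A slightly more defensive variant of the same idea (my addition, not the answerer's): check the response before writing, so a failed request doesn't silently produce a broken PNG:

import requests

url = ('https://maps.googleapis.com/maps/api/staticmap'
       '?center=31.0456,121.3997&zoom=12&size=320x385&sensor=false')
resp = requests.get(url, timeout=10)
resp.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx
with open('static.png', 'wb') as f:
    f.write(resp.content)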
Why are you parsing the map URL? Construct it yourself:
import json, urllib

query = '' # IP to get coordinates of, leave empty for current IP
geo = urllib.urlopen('http://ip-api.com/json/%s?fields=240' % query)
result = json.load(geo)

if result['zip']:
    zoom = 13
elif result['city']:
    zoom = 12
else:
    zoom = 6

map_img_url = "https://maps.googleapis.com/maps/api/staticmap?center=%s,%s&zoom=%i&size=320x385&sensor=false" % (result['lat'], result['lon'], zoom)
get_map_img = urllib.urlretrieve(map_img_url, "staticmap.png")
