Here is my code:
import urllib
import webbrowser
from bs4 import BeautifulSoup
import requests
import re
address = 'https://google.com/search?q='
# Default Google search address start
file = open( "OCR.txt", "rt" )
# Open text document that contains the question
word = file.read()
file.close()
myList = [item for item in word.split('\n')]
newString = ' '.join(myList)
# The question is on multiple lines so this joins them together with proper spacing
qstr = urllib.parse.quote_plus(newString)
# Encode the string
newWord = address + qstr
# Combine the base and the encoded query
response = requests.get(newWord)
#with open('output.html', 'wb') as f:
# f.write(response.content)
#webbrowser.open('output.html')
answers = open("ocr2.txt", "rt")
ansTable = answers.read()
answers.close()
ans = ansTable.splitlines()
ans1 = str(ans[0])
ans2 = str(ans[2])
ans3 = str(ans[4])
ans1Score = 0
ans2Score = 0
ans3Score = 0
links = []
soup = BeautifulSoup(response.text, 'lxml')
for r in soup.find_all(class_='r'):
linkRaw = str(r)
link = re.search("(?P<url>https?://[^\s]+)", linkRaw).group("url")
if '&' in link:
finalLink = link.split('&')
link = str(finalLink[0])
links.append(link)
#print(links)
#print(' ')
for g in soup.find_all(class_='g'):
webBlock = str(g)
ans1Tally = webBlock.count(ans1)
ans2Tally = webBlock.count(ans2)
ans3Tally = webBlock.count(ans3)
if ans1 in webBlock:
ans1Score += ans1Tally
else:
ans1Found = False
if ans2 in webBlock:
ans2Score += ans2Tally
else:
ans2Found = False
if ans3 in webBlock:
ans3Score += ans3Tally
else:
ans3Found = False
if ans1Found and ans2Found and ans3Found is False:
searchLink = str(links[0])
if searchLink.endswith('pdf'):
pass
else:
response2 = requests.get(searchLink)
soup2 = BeautifulSoup(response2.text, 'lxml')
for p in soup2.find_all('p'):
extraBlock = str(p)
extraAns1Tally = extraBlock.count(ans1)
extraAns2tally = extraBlock.count(ans2)
extraAns3Tally = extraBlock.count(ans3)
if ans1 in extraBlock:
ans1Score += extraAns1Tally
if ans2 in extraBlock:
ans2Score += extraAns2Tally
if ans3 in extraBlock:
ans3Score += extraAns3Tally
with open("Results.txt", "w") as results:
results.write(newString + '\n\n')
results.write(ans1+": "+str(ans1Score)+'\n')
results.write(ans2+": "+str(ans2Score)+'\n')
results.write(ans3+": "+str(ans3Score))
links.pop(0)
print(' ')
print('-----')
print(ans1+": "+str(ans1Score))
print(ans2+": "+str(ans2Score))
print(ans3+": "+str(ans3Score))
print('-----')
Basically, right now it is scraping each "g" block one at a time, when this program could benefit massively from scraping all of the links at the same time; I want them all scraping concurrently instead of each one waiting until the one before it is done. Sorry if this is a simple kind of question, but I have little experience with asyncio, so if anyone could help that would be massively appreciated. Thanks!
To write an async program you need to:
define functions with async def
call them with await
create an event loop and run some function in it
run requests concurrently using asyncio.gather
Everything else is almost the same as usual. Instead of the blocking requests module you should use an async one, for example aiohttp:
python -m pip install aiohttp
And use it like this:
async def get(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return await resp.text()
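For reference, here is a minimal sketch of running several of these requests concurrently with asyncio.gather (fetch_all is a hypothetical helper, not something from your script):
import asyncio

async def fetch_all(urls):
    # Schedule every GET at once and wait until all responses have arrived.
    return await asyncio.gather(*(get(url) for url in urls))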
Here's your code with some of the changes I started. I didn't check whether it actually works since I don't have the files you use. You should also move the logic inside for g in soup.find_all(class_='g'): into a separate function and run multiple of these functions with asyncio.gather to get the benefit of asyncio.
import asyncio
import aiohttp
import urllib
import webbrowser
from bs4 import BeautifulSoup
import re
async def get(url):
async with aiohttp.ClientSession() as session:
async with session.get(url) as resp:
return await resp.text()
async def main():
address = 'https://google.com/search?q='
# Default Google search address start
file = open( "OCR.txt", "rt" )
# Open text document that contains the question
word = file.read()
file.close()
myList = [item for item in word.split('\n')]
newString = ' '.join(myList)
# The question is on multiple lines so this joins them together with proper spacing
qstr = urllib.parse.quote_plus(newString)
# Encode the string
newWord = address + qstr
# Combine the base and the encoded query
text = await get(newWord)
#with open('output.html', 'wb') as f:
# f.write(response.content)
#webbrowser.open('output.html')
answers = open("ocr2.txt", "rt")
ansTable = answers.read()
answers.close()
ans = ansTable.splitlines()
ans1 = str(ans[0])
ans2 = str(ans[2])
ans3 = str(ans[4])
ans1Score = 0
ans2Score = 0
ans3Score = 0
links = []
soup = BeautifulSoup(text, 'lxml')
for r in soup.find_all(class_='r'):
linkRaw = str(r)
link = re.search("(?P<url>https?://[^\s]+)", linkRaw).group("url")
if '&' in link:
finalLink = link.split('&')
link = str(finalLink[0])
links.append(link)
#print(links)
#print(' ')
for g in soup.find_all(class_='g'):
webBlock = str(g)
ans1Tally = webBlock.count(ans1)
ans2Tally = webBlock.count(ans2)
ans3Tally = webBlock.count(ans3)
if ans1 in webBlock:
ans1Score += ans1Tally
else:
ans1Found = False
if ans2 in webBlock:
ans2Score += ans2Tally
else:
ans2Found = False
if ans3 in webBlock:
ans3Score += ans3Tally
else:
ans3Found = False
if ans1Found and ans2Found and ans3Found is False:
searchLink = str(links[0])
if searchLink.endswith('pdf'):
pass
else:
text2 = await get(searchLink)
soup2 = BeautifulSoup(text2, 'lxml')
for p in soup2.find_all('p'):
extraBlock = str(p)
extraAns1Tally = extraBlock.count(ans1)
extraAns2tally = extraBlock.count(ans2)
extraAns3Tally = extraBlock.count(ans3)
if ans1 in extraBlock:
ans1Score += extraAns1Tally
if ans2 in extraBlock:
ans2Score += extraAns2Tally
if ans3 in extraBlock:
ans3Score += extraAns3Tally
with open("Results.txt", "w") as results:
results.write(newString + '\n\n')
results.write(ans1+": "+str(ans1Score)+'\n')
results.write(ans2+": "+str(ans2Score)+'\n')
results.write(ans3+": "+str(ans3Score))
links.pop(0)
print(' ')
print('-----')
print(ans1+": "+str(ans1Score))
print(ans2+": "+str(ans2Score))
print(ans3+": "+str(ans3Score))
print('-----')
if __name__ == '__main__':
loop = asyncio.get_event_loop()
try:
loop.run_until_complete(main())
finally:
loop.run_until_complete(loop.shutdown_asyncgens())
loop.close()
Update:
The main idea is to move the logic inside the loop that makes the request into a separate coroutine and pass multiple of these coroutines to asyncio.gather. That will parallelize your requests.
async def main():
    # Here, do everything that comes before the loop.
    coros = [
        process_single_g(g)
        for g
        in soup.find_all(class_='g')
    ]
    # asyncio.gather will run multiple tasks concurrently
    # and return all the results together.
    results = await asyncio.gather(*coros)
    for res in results:
        ans1Score, ans2Score, ans3Score = res
        print(' ')
        print('-----')
        print(ans1+": "+str(ans1Score))
        print(ans2+": "+str(ans2Score))
        print(ans3+": "+str(ans3Score))
        print('-----')

async def process_single_g(g):
    # Here, do everything you did inside the loop for a single g.
    text2 = await get(searchLink)
    # ...
    return ans1Score, ans2Score, ans3Score
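(A note on the sketch: process_single_g would also need access to searchLink, ans1, ans2 and ans3, for example by passing them in as extra parameters alongside g.)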
I have a Discord Bot for UKHotDeals, but it throws an error.
This is written for Python 3.x.
The original repository can be found here: https://github.com/davidteather/Hotukdeals-Discord-Notifier
Traceback (most recent call last):
File "C:\Users\USER\Desktop\Hotukdeals-Discord-Notifier-master\main.py", line 179, in <module>
client = MyClient(channel_id)
File "C:\Users\USER\Desktop\Hotukdeals-Discord-Notifier-master\main.py", line 31, in __init__
super().__init__(*args, **kwargs)
TypeError: Client.__init__() missing 1 required keyword-only argument: 'intents'
I can't figure out what I'm missing in the code, which is this:
import discord
import asyncio
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import json
with open('settings.json') as data:
settings = json.load(data)
min_upvotes = int(settings["min_upvotes"])
max_upvotes = int(settings["max_upvotes"])
base_url = settings["base_url"]
pages_to_index = int(settings["pages_to_index"])
discord_api_key = settings["discord_api_token"]
min_price = float(settings["min_price"])
max_price = float(settings["max_price"])
channel_id = int(settings["discord_channel_id"])
time_interval_seconds = int(settings["time_interval_seconds"])
class MyClient(discord.Client):
def __init__(self, channel, *args, **kwargs):
self.outOfStock = []
self.checkUrls = []
self.channelID = channel
super().__init__(*args, **kwargs)
# create the background task and run it in the background
self.bg_task = self.loop.create_task(self.my_background_task())
# Check deals
def checkDealsBeautifulSoup(self, url):
# Imports
import requests
from bs4 import BeautifulSoup
import json
import random
# Loads JSON and vars
with open('settings.json') as data:
settings = json.load(data)
min_upvotes = int(settings["min_upvotes"])
max_upvotes = int(settings["max_upvotes"])
min_price = float(settings["min_price"])
max_price = float(settings["max_price"])
# Loads proxies
with open('proxies.txt', 'r') as proxies:
proxies = proxies.readlines()
# Picks random proxy
proxy = random.choice(proxies)
returnMsgs = []
newArray = []
# Reads already used things
with open('data/usedLinks.txt', 'r') as data:
usedArray = data.readlines()
# Sets up proxy
proxies = {
"http": "http://" + proxy,
"https": "https://" + proxy,
}
page = requests.get(url, proxies=proxies)
soup = BeautifulSoup(page.text, 'html.parser')
var = False
# Tries to get things
try:
listings = soup.find_all(
'article', attrs={'data-handler': 'history'})
upvotes = soup.find_all('span', attrs={'class': 'cept-vote-temp'})
pricing = soup.find_all('span', attrs={'class': 'thread-price'})
urls = soup.find_all(
'a', attrs={'class': 'cept-thread-image-link'})
var = True
except:
var = False
if var == True:
upvotesIndex = 0
index = 0
for x in range(0, len(listings)):
try:
upvote = upvotes[upvotesIndex].text.strip().replace(
" ", "").replace("°", "").replace("\n", "")
if "Deal" in upvote or "alerts" in upvote:
upvotesIndex += 1
upvote = upvotes[upvotesIndex].text.strip().replace(
" ", "").replace("°", "").replace("\n", "")
except:
upvote = 0
try:
price = pricing[index].text.strip().replace("£", "")
except:
price = 0
try:
url = urls[index].get('href')
except:
url = None
if price != "FREE":
try:
price = float(price.replace(",", ""))
except:
price = 0
else:
price = 0
if min_price <= price <= max_price:
if min_upvotes <= int(upvote) <= max_upvotes:
if url != None:
if url + "\n" not in usedArray:
# Return Message
message = url + " Satisfies your deal criteria. It is at " + \
str(upvote) + \
" degrees and costs £" + str(price)
returnMsgs.append(message)
usedArray.append(url)
newArray.append(url)
upvotesIndex += 1
index += 1
# Saves new logged files
with open('data/usedLinks.txt', 'a') as fileObj:
for line in newArray:
fileObj.write(line + "\n")
# Returns stuff
return returnMsgs
# On start
async def on_ready(self):
print('Logged in as')
print(self.user.name)
print(self.user.id)
print('------')
# On message
async def on_message(self, message):
if message.author.id == self.user.id:
return
# Background manager
async def my_background_task(self):
await self.wait_until_ready()
channel = self.get_channel(int(channel_id))
while not self.is_closed():
for page in range(0, int(pages_to_index)):
print('checking page ' + str(page))
res = self.checkDealsBeautifulSoup(
base_url + "?page=" + str(page))
if res != []:
for msg in res:
await channel.send(msg)
await asyncio.sleep(int(time_interval_seconds))
# Main
client = MyClient(channel_id)
client.run(discord_api_key)
channel_id and discord_api_key are set correctly in settings.json, like this:
{
"min_upvotes": "500",
"max_upvotes": "1000",
"base_url": "https://www.hotukdeals.com",
"pages_to_index": "10",
"discord_api_token": "asdAxNasdDkxNzQ1NDcasdasd4ODU1OTAxOQ.GxasdNr.Hasdv7k9Iladsdvasd67jasdasdCXHF4",
"min_price": "0",
"max_price": "500",
"discord_channel_id": "5712311231233167",
"time_interval_seconds": "1800"
}
Looking at this other thread, the Discord client now requires an Intents object in its constructor:
client = discord.Client(intents=discord.Intents.default())
In your case, you would have to fix the corresponding call when instantiating your own instance.
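For example, a minimal sketch (assuming the default intents are enough for this bot; the intents variable is not in the original code):
# Hypothetical fix: build default intents and pass them through MyClient,
# which forwards them to discord.Client via super().__init__().
intents = discord.Intents.default()
client = MyClient(channel_id, intents=intents)
client.run(discord_api_key)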
It seems that the error comes from discord.Client, here:
class MyClient(discord.Client):
def __init__(self, channel, *args, **kwargs):
self.outOfStock = []
self.checkUrls = []
self.channelID = channel
super().__init__(*args, **kwargs)
Check these:
https://discordpy.readthedocs.io/en/stable/api.html#discord.Client.intents
https://discordpy.readthedocs.io/en/stable/api.html#discord.Intents
I am trying to build a website change monitor and would like the function below to print the text added to the URL (if and when a change is published to the website).
I can't figure out why, instead of printing the added text, it prints "<generator object Differ.compare at 0x108a62c00>".
Thanks for your help!
from bs4 import BeautifulSoup
import requests
import difflib
import time
from datetime import datetime
def getContent(url):
result = requests.get(url)
doc = BeautifulSoup(result.text, "html.parser")
return doc
def monitorUrl(url):
PrevVersion = ""
FirstRun = True
while True:
monitoredContent = getContent(url)
if PrevVersion != monitoredContent:
if FirstRun == True:
PrevVersion = monitoredContent
FirstRun = False
print ("Start Monitoring "+url+ ""+ str(datetime.now()))
else:
print ("Changes detected at: "+ str(datetime.now()))
OldPage = PrevVersion
NewPage = monitoredContent
d = difflib.Differ()
diff = d.compare(OldPage, NewPage)
print(diff)
OldPage = NewPage
PrevVersion = monitoredContent
else:
print( "No Changes "+ str(datetime.now()))
time.sleep(10)
continue
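For what it's worth, here is a minimal sketch of the usual fix, assuming you want to diff the pages' visible text: Differ.compare() returns a generator, so print(diff) only shows the generator object itself; you have to iterate it, and it is easier to compare lists of text lines than BeautifulSoup objects.
# Sketch only: compare the visible text of the two versions line by line.
old_lines = OldPage.get_text().splitlines()
new_lines = NewPage.get_text().splitlines()
diff = difflib.Differ().compare(old_lines, new_lines)
print('\n'.join(diff))  # consuming the generator prints the actual diff lines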
I am trying to run a multithreaded email checker to see if the emails are valid Office 365 addresses.
Looking over my code again and again, I cannot seem to find the reason it's not working correctly.
It should be appending the email to a GOOD or BAD list.
Instead, it's not appending anything!
This is my code:
...
currentDirectory = os.getcwd() # set the current directory - /new/
# Locations
location_emails_goods = currentDirectory + '/contacts/goods/'
location_emails_bads = currentDirectory + '/contacts/bads/'
location_emails = currentDirectory + '/contacts/contacts.txt'
now = datetime.now()
todayString = now.strftime('%d-%m-%Y-%H-%M-%S')
FILE_NAME_DATE_GOODS = None
FILE_NAME_DATE_BADS = None
ALL_EMAILS = get_contacts(location_emails)
url = 'https://login.microsoftonline.com/common/GetCredentialType'
# Get all emails
def get_contacts(filename):
emails = []
with open(filename, mode='r', encoding='utf-8') as contacts_file:
for a_contact in contacts_file:
emails.append(a_contact.strip())
return emails
def saveLogs():
global GOOD_EMAILS_ARRAY, BAD_EMAILS_ARRAY, file_bads, file_goods, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS
#print(GOOD_EMAILS_ARRAY)
for good in GOOD_EMAILS_ARRAY:
file_goods.write(good + '\n')
file_goods.close()
for bad in BAD_EMAILS_ARRAY:
file_bads.write(bad + '\n')
file_bads.close()
def newChecker(email):
global url, GOOD_EMAILS_ARRAY, BAD_EMAILS_ARRAY
s = req.session()
body = '{"Username":"%s"}' % email
request = req.post(url, data=body)
response = request.text
valid = re.search('"IfExistsResult":0,', response)
invalid = re.search('"IfExistsResult":1,', response)
if invalid:
BAD_EMAILS_ARRAY.append(email)
if valid:
GOOD_EMAILS_ARRAY.append(email)
else:
if valid:
GOOD_EMAILS_ARRAY.append(email)
else:
BAD_EMAILS_ARRAY.append(email)
# The following prints an empty array even though I have defined GOOD_EMAILS_ARRAY globally, so it should be updating
print(GOOD_EMAILS_ARRAY)
def mp_handler(p):
global ALL_EMAILS
p.map(newChecker, ALL_EMAILS)
if __name__ == '__main__':
# Foreach email, parse it into our checker
# Define a filename to save to
FILE_NAME_DATE_GOODS = '{}{}{}'.format(location_emails_goods, todayString, '.txt')
FILE_NAME_DATE_BADS = '{}{}{}'.format(location_emails_bads, todayString, '.txt')
file_bads = open(FILE_NAME_DATE_BADS, 'a')
file_goods = open(FILE_NAME_DATE_GOODS, 'a')
p = multiprocessing.Pool(500)
mp_handler(p)
saveLogs()
p.close()
As you can see, I am trying to append each email to either GOOD_EMAILS_ARRAY or BAD_EMAILS_ARRAY.
BAD_EMAILS_ARRAY and GOOD_EMAILS_ARRAY are global variables, but for some reason nothing gets appended to them.
I am running this through multiprocessing, in case you need to know.
Any ideas, or errors you can spot in my code?
Processes do not share memory; a global variable with the same name in two processes is two different objects.
If you need to share state between processes, see this:
https://docs.python.org/3/library/multiprocessing.html#sharing-state-between-processes
Okay so it turns out that I just needed to use the Manager from multiprocessing:
from multiprocessing import Manager, Pool
Then I could use a normal list through the manager, such as:
# Set empty arrays using manager so we can carry it over
manager = Manager()
bad_list = manager.list()
good_list = manager.list()
This allowed me to use my script as it was, just with these new Manager-backed lists, which works just how I wanted :)
...
FILE_NAME_DATE_GOODS = None
FILE_NAME_DATE_BADS = None
# Set empty arrays using manager so we can carry it over
manager = Manager()
bad_list = manager.list()
good_list = manager.list()
# Get all emails
def get_contacts(filename):
emails = []
with open(filename, mode='r', encoding='utf-8') as contacts_file:
for a_contact in contacts_file:
emails.append(a_contact.strip())
return emails
ALL_EMAILS = get_contacts(location_emails)
url = 'https://login.microsoftonline.com/common/GetCredentialType'
def saveLogs():
global file_bads, file_goods, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS, good_list, bad_list
for good in good_list:
file_goods.write(good + '\n')
file_goods.close()
for bad in bad_list:
file_bads.write(bad + '\n')
file_bads.close()
print('{} => Fully completed email scanning'.format(Fore.CYAN))
print('{} => Good emails [{}] || Bad emails [{}]'.format(Fore.GREEN, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS))
def newChecker(email):
global url, good_list, bad_list
s = req.session()
body = '{"Username":"%s"}' % email
request = req.post(url, data=body)
response = request.text
valid = re.search('"IfExistsResult":0,', response)
invalid = re.search('"IfExistsResult":1,', response)
if invalid:
bad_list.append(email)
if valid:
good_list.append(email)
else:
if valid:
good_list.append(email)
else:
bad_list.append(email)
def mp_handler(p):
global ALL_EMAILS
p.map(newChecker, ALL_EMAILS)
if __name__ == '__main__':
# Foreach email, parse it into our checker
# Define a filename to save to
FILE_NAME_DATE_GOODS = '{}{}{}'.format(location_emails_goods, todayString, '.txt')
FILE_NAME_DATE_BADS = '{}{}{}'.format(location_emails_bads, todayString, '.txt')
file_bads = open(FILE_NAME_DATE_BADS, 'a')
file_goods = open(FILE_NAME_DATE_GOODS, 'a')
p = multiprocessing.Pool(500)
mp_handler(p)
saveLogs()
p.close()
I have been trying to make a bot that searches for a specific keyword in a Reddit post title and, if that keyword matches, comments something in that thread. Everything works fine, except for one problem: after around 4 hours of running it keeps searching but it stops commenting, and I have no idea why. Then I restart it and it works OK.
It seems to happen around 3pm PST every day; it keeps printing that it is searching, but it just won't comment even if there are posts that contain the keywords. Is this something that Reddit does to stop bots, or is something wrong with my code?
Before, I had the praw.Reddit statement outside of my 3 subreddit functions, but I wanted to test whether reconnecting to PRAW after every search would stop the issue.
In short, my Reddit bot stops commenting after a certain point. Is there any way I could fix this, or is it permanent?
#!/usr/bin/python
import praw
import pdb
import re
import os
import threading
import time
sub1_array = ['Title']
sub1_link = ['Comment']
def sub1():
reddit = praw.Reddit('bot1')
if not os.path.isfile("posts_replied_to.txt"):
posts_replied_to = []
else:
with open("posts_replied_to.txt", "r") as f:
posts_replied_to = f.read()
posts_replied_to = posts_replied_to.split("\n")
posts_replied_to = list(filter(None, posts_replied_to))
subreddit = reddit.subreddit('sub1')
print("Checking sub1")
for submission in subreddit.new(limit=20):
i = 0
while i <= (len(sub1_array) - 1):
# If we haven't replied to this post before
if submission.id not in posts_replied_to:
# Do a case insensitive search
if re.search(sub1_array[i], submission.title, re.IGNORECASE):
# Reply to the post
submission.reply(sub1_link[i])
print("Bot replying to match: ", submission.title)
del sub1_array[i]
del sub1_link[i]
posts_replied_to.append(submission.id)
time.sleep(100)
else:
i += 1
else:
i += 1
with open("posts_replied_to.txt", "w") as f:
for post_id in posts_replied_to:
f.write(post_id + "\n")
sub2_array = ['Title']
sub2_link = ['Comment']
def sub2():
reddit = praw.Reddit('bot1')
if not os.path.isfile("posts_replied_to.txt"):
posts_replied_to = []
else:
with open("posts_replied_to.txt", "r") as f:
posts_replied_to = f.read()
posts_replied_to = posts_replied_to.split("\n")
posts_replied_to = list(filter(None, posts_replied_to))
subreddit = reddit.subreddit('sub2')
print("Checking Streams NBA")
for submission in subreddit.new(limit=20):
#print(submission.title)
i = 0
while i <= (len(sub2_array) - 1):
# If we haven't replied to this post before
if submission.id not in posts_replied_to:
# Do a case insensitive search
if re.search(sub2_array[i], submission.title, re.IGNORECASE):
# Reply to the post
submission.reply(sub2_link[i])
print("Bot replying to match: ", submission.title)
del sub2_array[i]
del sub2_link[i]
posts_replied_to.append(submission.id)
time.sleep(100)
else:
i += 1
else:
i += 1
with open("posts_replied_to.txt", "w") as f:
for post_id in posts_replied_to:
f.write(post_id + "\n")
sub3_array = ['Title']
sub3_link = ['Comment']
def ncaa():
reddit = praw.Reddit('bot1')
if not os.path.isfile("posts_replied_to.txt"):
posts_replied_to = []
else:
with open("posts_replied_to.txt", "r") as f:
posts_replied_to = f.read()
posts_replied_to = posts_replied_to.split("\n")
posts_replied_to = list(filter(None, posts_replied_to))
subreddit = reddit.subreddit('sub3')
print("Checking sub3")
for submission in subreddit.new(limit=20):
#print(submission.title)
i = 0
while i <= (len(sub3_array) - 1):
# If we haven't replied to this post before
if submission.id not in posts_replied_to:
# Do a case insensitive search
if re.search(sub3_array[i], submission.title, re.IGNORECASE):
# Reply to the post
submission.reply(sub3_link[i])
print("Bot replying to match: ", submission.title)
del sub3_array[i]
del sub3_link[i]
posts_replied_to.append(submission.id)
time.sleep(100)
else:
i += 1
else:
i += 1
with open("posts_replied_to.txt", "w") as f:
for post_id in posts_replied_to:
f.write(post_id + "\n")
def should_reset_timer():
pass
def main():
sub1()
sub2()
sub3()
timer = 0
while True:
time.sleep(1)
timer+=1
if should_reset_timer():
timer = 0
if timer == 1*30:
sub1()
sub2()
sub3()
timer = 0
# Store the current id into our list
# Write our updated list back to the file
main()
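As a debugging aid (a sketch, not a confirmed diagnosis), it may help to make the loop tell you why it is not commenting, for example by logging any exception raised by submission.reply and how many keywords are left after the del statements. This assumes a PRAW version where praw.exceptions.PRAWException is the common base exception:
# Hypothetical instrumentation for the reply step inside sub1() (same idea for sub2/sub3).
try:
    submission.reply(sub1_link[i])
    print("Bot replying to match: ", submission.title)
except praw.exceptions.PRAWException as e:
    print("Reply failed:", e)
print("Keywords remaining for sub1:", sub1_array)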
Right now I am working on a Python script which takes a list of URLs as an argument, performs a GET request on each URL, and then searches through the output with XPath to fingerprint the website. It seems to work like a charm when the list is around 50 sites long, but anything beyond that causes the program to slow down to the point where it stops (usually around 150 sites). Scroll down to where you see the main app logic; the relevant code is below it. Right now I am just using 50 elements in the array and it works fine, but anything more makes the entire program stop. Any suggestions would be greatly appreciated!
#!/usr/bin/python
# Web Scraper
# 1.0
# Imports for file
from multiprocessing.dummy import Pool as ThreadPool
from threading import Thread
from Queue import Queue
from lxml import html
import requests
import time
import sys
# Get Raw HTML
def scrape(url):
try:
page = requests.get(url, timeout=2.0)
if page.status_code == requests.codes.ok:
html_page = html.fromstring(page.content)
s =requests.session()
s.close()
return html_page
else:
s =requests.session()
s.close()
return False
except:
s =requests.session()
s.close()
return False
# Format URL
def format_url(url):
if url.find("http://") == -1:
url = "http://"+url
if url[-1] == "/":
url = url[:-1]
return url
# Check if WordPress Site
def check_wordpress(tree):
scripts = tree.xpath("//script[contains(@src,'wp-content')]")
if len(scripts) > 0:
return True
return False
# Check WordPress Version
def wordpress_version(tree):
type = tree.xpath("//meta[@name='generator']/@content")
version = 0
if len(type) > 0:
details = type[0].split()
if len(details)>1 and details[0] == "WordPress":
if len(details) > 1:
version = details[1]
else:
version = type[0]
return version
# Find Contact Page
def find_contact_page(tree):
contact = tree.xpath("//a[contains(text(),'Contact')]/@href")
try_xpath = 1
while len(contact) == 0:
if try_xpath == 1:
contact = tree.xpath("//span[contains(text(),'Contact')]/../@href")
elif try_xpath == 2:
contact = tree.xpath("//p[contains(text(),'Contact')]/../@href")
elif try_xpath == 3:
break
try_xpath+=1
if len(contact) > 0:
contact = contact[0]
if contact.find('#') == -1:
if contact[0] == '/':
contact = url + "" + contact
print contact
# Juicer method
def juice(url):
url = format_url(url)
string = url
tree = scrape(url)
if tree == False:
return string + " \t\t\t No XML tree"
elif check_wordpress(tree) == True:
version = wordpress_version(tree)
return string + " \t\t\t WordPress: " + str(version)
else:
return string + " \t\t\t Not WordPress"
# Main App Logic Below ------------------------------------->
# Open list of websites from given argument
list = open(sys.argv[1],'r').read().split('\n')
# Juice url
def juice_url():
while True:
url = q.get()
result = juice(url)
print result
q.task_done()
# Create concurrent queues
concurrent = 50
q = Queue(concurrent)
for i in range(concurrent):
t = Thread(target=juice_url)
t.daemon = True
t.start()
# Add URL to Queue
time1 = time.time()
for url in list[0:50]:
q.put(url)
q.join()
# Calculate total time
total = time.time() - time1
print "Total Time: %f" % total
print "Average Time: %f" % (total/50)
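As one possible direction (a sketch I have not benchmarked, assuming the hand-rolled Queue/Thread setup is part of the problem), the ThreadPool that is already imported at the top could map juice over the whole list with a bounded number of workers:
# Alternative main logic using the ThreadPool imported above.
pool = ThreadPool(50)               # 50 worker threads
results = pool.map(juice, list)     # fingerprint every URL in the input list
pool.close()
pool.join()
for result in results:
    print(result)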