Python difflib - generator object / not expected output - python

I am trying to build a website change monitor and would like the below function to print the text added to the url (if and when the change is published to the website).
I can't figure out why instead of printing the added text, it returns "<generator object Differ.compare at 0x108a62c00>"
Thanks for your help!
from bs4 import BeautifulSoup
import requests
import difflib
import time
from datetime import datetime
def getContent(url):
    """Fetch *url* and return it parsed as a BeautifulSoup document."""
    response = requests.get(url)
    return BeautifulSoup(response.text, "html.parser")
def monitorUrl(url):
    """Poll *url* every 10 seconds and print a line diff whenever it changes.

    Runs forever (call once per monitored URL).
    """
    PrevVersion = ""
    FirstRun = True
    while True:
        monitoredContent = getContent(url)
        if PrevVersion != monitoredContent:
            if FirstRun:
                # First poll: remember the baseline, nothing to diff yet.
                PrevVersion = monitoredContent
                FirstRun = False
                print ("Start Monitoring "+url+ ""+ str(datetime.now()))
            else:
                print ("Changes detected at: "+ str(datetime.now()))
                # Bug fix: Differ.compare() returns a *lazy generator*, so
                # print(diff) printed "<generator object Differ.compare ...>".
                # It also expects sequences of lines, not BeautifulSoup
                # objects.  Diff line-by-line and join into printable text.
                old_lines = str(PrevVersion).splitlines()
                new_lines = str(monitoredContent).splitlines()
                d = difflib.Differ()
                diff = d.compare(old_lines, new_lines)
                print('\n'.join(diff))
                PrevVersion = monitoredContent
        else:
            print( "No Changes "+ str(datetime.now()))
        # Sleep on every iteration (the original slept only on the
        # "no changes" branch, busy-looping after each detected change).
        time.sleep(10)

Related

How can I add threading on my Python code?

Below is my try to create a username availability checker with proxies, so far it works as intended
The only thing is that it's slow. I tried to implement threads, but it made no difference, as I'm not sure whether I'm doing it right or not.
used concurrent.futures and threading libraries.
Is there a better way to code this kind of programs or are there any other suggestions?
Thanks in advance
import requests
import json
import ctypes
import colorama
from colorama import Fore
from datetime import datetime
import os
# --- one-time console / run setup (module level, runs on import) ---
os.system("cls")  # clear the console; "cls" is Windows-only
now = datetime.now()
current_time = now.strftime("%H:%M:%S")  # run start time, shown in the console title
colorama.init()  # enable ANSI colour handling on Windows terminals
url = "https://link"  # placeholder endpoint the availability check POSTs to
def grab_proxies():
    """Read proxies from proxy.txt (one per line) and return them as a list.

    Returns:
        list[str]: proxy addresses with trailing newlines stripped.
    """
    # 'with' guarantees the handle is closed -- the original leaked it.
    with open('proxy.txt', 'r') as prx:
        return [line.rstrip("\n") for line in prx]
prlist = grab_proxies()
def grab_usernames():
    """Read candidate usernames from userlist.txt and return them as a list.

    Returns:
        list[str]: usernames with trailing newlines stripped.
    """
    # 'with' guarantees the handle is closed -- the original leaked it.
    with open('userlist.txt', 'r') as users:
        return [line.rstrip("\n") for line in users]
ulist = grab_usernames()
# --- main check loop: one username attempt per iteration, rotating proxies on failure ---
found = 0   # usernames written to found.txt so far
pc = 0      # index of the proxy currently in use
uc = 0      # index of the username currently being checked
for i in range(0,len(prlist)):
    # Windows-only: show live progress in the console title bar.
    ctypes.windll.kernel32.SetConsoleTitleW(f"[# Checker] | Counter: %s - Found: %s - Current Proxy: %s - Started at: %s" % (i, found, prlist[pc], current_time))
    try:
        # NOTE(review): `headers` is never defined in this file -- this line
        # raises NameError unless it is supplied elsewhere; confirm.
        req = requests.post(url,headers=headers, data = {"requested_username": ulist[uc], "xsrf_token": "F0kpyvjJgeBtsOk5Gl6Jvg"},proxies={'http' : prlist[pc],'https': prlist[pc]}, timeout=2)
        response = req.json()
        #print(response,req.status_code)
        #print(response)
        #print(type(response))
        if(response['reference']['status_code'] == 'TAKEN'):
            #rd = response['errors']['username'][0]['code']
            print(f'{Fore.LIGHTBLACK_EX}[{Fore.LIGHTRED_EX}Taken{Fore.LIGHTBLACK_EX}]{Fore.LIGHTCYAN_EX} {ulist[uc]}')
            #print(ulist[uc]+" Taken")
            uc+=1
        elif(response['reference']['status_code'] == 'OK'):
            print(f'{Fore.LIGHTBLACK_EX}[{Fore.LIGHTGREEN_EX}Available{Fore.LIGHTBLACK_EX}]{Fore.LIGHTCYAN_EX} {ulist[uc]}')
            #print(ulist[uc]+" Available")
            # Append hits immediately so progress survives a crash.
            f = open("found.txt","a")
            f.write(ulist[uc]+"\n")
            f.close()
            found+=1
            uc+=1
        elif(response['reference']['status_code'] == 'INVALID_BEGIN'):
            print(f'{Fore.LIGHTBLACK_EX}[{Fore.LIGHTRED_EX}Invalid Username{Fore.LIGHTBLACK_EX}]{Fore.LIGHTCYAN_EX} {ulist[uc]}')
            uc+=1
        elif(response['reference']['status_code'] == 'DELETED'):
            print(f'{Fore.LIGHTBLACK_EX}[{Fore.LIGHTRED_EX}Deleted{Fore.LIGHTBLACK_EX}]{Fore.LIGHTCYAN_EX} {ulist[uc]}')
            uc+=1
        else:
            print(response)
    except:
        # Any failure (timeout, bad proxy, JSON error) advances to the next
        # proxy.  NOTE(review): the bare except hides bugs, and pc can run
        # past the end of prlist if every proxy fails -- confirm intended.
        #print(prlist[pc]+ " Going to next proxy")
        pc+=1
        pass
    #break
x = input("Finished!.. press enter to exit")
You could use https://github.com/encode/requests-async to do your requests in an async way

Loop Through IP Address Range to Check Printer Status in HTML - Python & BeautifulSoup

I'm trying to loop through a few IP addresses which are printer to make sure the status is READY and I'm not sure my code is actually looping through each one. The code should print Status: READY the first time the code runs then after every 2 mins it will check again if nothing changed then it will print "Nothing Changed" otherwise it will print the Status.
# Scale1 Ticket Printer 10.56.32.247
# Scale2 Ticket Printer 10.56.32.248
# Scale3 Ticket Printer 10.56.32.246
import sys
import requests
from bs4 import BeautifulSoup
import time
def main():
    """Fetch the status <h3> text from each printer and return {ip: status}.

    Bug fixes vs. the original: the `return` sat inside the loop, so only
    the first printer (10.56.32.246) was ever checked, and range(246, 248)
    excluded .248 even though three printers are listed.  The unused
    `result.extend(resp)` accumulator is removed.
    """
    statuses = {}
    for ip in range(246, 249):  # .246, .247 and .248
        resp = requests.get("http://10.56.32.%d" % ip)
        soup = BeautifulSoup(resp.text, 'lxml')
        # First <h3> on the printer's status page holds e.g. "Status: READY".
        statuses[ip] = soup.find_all('h3')[0].text
    return statuses
# --- change-detection loop: poll, report, wait 2 minutes, repeat ---
res_before = ""
while True:
    res = main()
    if res != res_before:
        # Bug fix: this print was commented out, so the first (changed)
        # status never appeared -- only "nothing changed" ever printed.
        print(res)
        res_before = res
    else:
        print("nothing changed")
    # Count down two minutes on a single console line.
    for i in range(120):
        msg = "Pausing for 2 minutes..."
        sys.stdout.write("\r{} {} seconds ".format(msg, i))
        time.sleep(1)
        sys.stdout.flush()
The first time the code runs it should print Status: READY but it's just printing nothing changed.
Here are the results from the code. Thank you in advance for any help; it's much appreciated.
nothing changed
Pausing for 2 minutes... 119 seconds nothing changed
You need to return e.g. a dict with a key/value for each IP address.
from bs4 import BeautifulSoup
import requests
import time
def get_statuses():
    """Return {ip_suffix: status-or-error-string} for each monitored printer."""
    statuses = {}
    for suffix in range(246, 248):
        response = requests.get(f"http://10.56.32.%d" % suffix)
        if response.status_code == 200:
            page = BeautifulSoup(response.text, "lxml")
            statuses[suffix] = page.find_all("h3")[0].text
        else:
            statuses[suffix] = f"Error {response.status_code}"
    return statuses
def main():
    """Print the printer statuses whenever they change, polling every 2 minutes."""
    previous = None
    while True:
        current = get_statuses()
        if current != previous:
            print("Status:", current)
            previous = current
        print("Checking again in 2 minutes.")
        time.sleep(120)


if __name__ == "__main__":
    main()
To print only changed statuses, you could do something like
# Variant: print only the entries whose status changed since the last poll.
old_statuses = {}
while True:
    new_statuses = get_statuses()
    for key, value in new_statuses.items():
        # .get() yields None for unseen keys, so every status prints once on
        # the first pass and afterwards only when it changes.
        if value != old_statuses.get(key):
            print("Changed:", key, value)
    old_statuses = new_statuses
And further, to give a name to each machine, make a mapping out of them.
# Map a human-readable machine name to each printer's status-page URL.
addresses = {
    "Scale1": "http://10.56.32.247/",
    "Scale2": "http://10.56.32.248/",
    "Scale3": "http://10.56.32.246/",
}


def get_statuses():
    """Return {machine_name: status-or-error-string} for every known printer."""
    statuses = {}
    for name, address in addresses.items():
        response = requests.get(address)
        if response.status_code == 200:
            page = BeautifulSoup(response.text, "lxml")
            statuses[name] = page.find_all("h3")[0].text
        else:
            statuses[name] = f"Error {response.status_code}"
    return statuses

requests_html stop website from redirecting

I am trying to scrape the follow link https://9anime.to/watch/one-piece-dub.34r/r2wjlq using python/requests_html.
My problem is it gets auto redirected to the default server tab instead of the mp4upload tab, trying to find a fix for this but cant figure it out.
Below is the code
import re
import requests
import cloudscraper
from urllib import parse
from bs4 import BeautifulSoup
from requests_html import HTMLSession
base_url = 'https://9anime.to'
class nine_scraper:
    """Scraper for 9anime: search, episode-link extraction, page fetching.

    Every method is invoked as a class-level function (no instances are
    created anywhere), so they are declared @staticmethod.
    """

    @staticmethod
    def get_ep_links(url):
        """Return absolute mp4upload episode URLs for a series page, or None."""
        html = nine_scraper.get_html(url, True)
        servers = html.find('div', id='servers-container')
        if not servers:
            print('No servers found!!')
            return None
        # data-id "35" identifies the mp4upload server tab.
        mp4upload = servers.find('div', attrs={'data-id': '35'})
        if mp4upload is None:
            # Bug fix: calling .find_all on None raised AttributeError.
            print('mp4upload server tab not found!')
            return None
        results = []
        for ep in mp4upload.find_all('a', href=True):
            results.append(base_url + ep.get('href'))
        return results

    @staticmethod
    def get_series_info(url):
        # Not implemented yet.
        return

    @staticmethod
    def get_servers(html):
        # Not implemented yet.
        return

    @staticmethod
    def find_download(url):
        # Placeholder: fetches the page but does not yet extract a download.
        html = nine_scraper.get_html(url, True)

    @staticmethod
    def search(query):
        """Search 9anime and return [(title, href)] plus pagination entries.

        A query of the form "name&page=N" requests result page N.
        Returns None when no results are found.
        """
        if '&page=' in query:
            query = query.split('&page=')
            search_url = base_url + '/search?keyword=' + parse.quote(query[0]) + '&page=' + query[1]
        else:
            search_url = base_url + '/search?keyword=' + parse.quote(query)
        html = nine_scraper.get_html(search_url, False)
        film_list = html.find('div', class_='film-list')
        if not film_list:
            print('No results found!')
            return None
        results = []
        prev_page = html.find('a', class_='pull-left')
        next_page = html.find('a', class_='pull-right')
        for film in film_list.find_all('div', class_='inner'):
            name = film.find('a', class_='name')
            results.append((name.text.strip(), name.get('href').strip()))
        # Bug fix: either pagination link may be absent on single-page
        # results -- guard against None before calling .get().
        if prev_page is not None and prev_page.get('href'):
            param = parse.urlsplit(base_url + '/' + prev_page.get('href')).query
            url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
            results.append(('Previous page', url))
        if next_page is not None and next_page.get('href'):
            param = parse.urlsplit(base_url + '/' + next_page.get('href')).query
            url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
            results.append(('Next page', url))
        return results

    @staticmethod
    def get_html(url, render_js=False):
        """Load *url* and return its parsed BeautifulSoup html (None on error).

        render_js=True fetches with requests_html and executes the page's
        JavaScript; otherwise cloudscraper is used (Cloudflare bypass, no JS).
        """
        try:
            if render_js:
                session = HTMLSession()
                resp = session.get(url, timeout=10)
                resp.raise_for_status()       # raise unless status is 200-399
                resp.html.render(timeout=10)  # execute the page's JavaScript
                return BeautifulSoup(resp.html.html, 'html.parser')
            c_scraper = cloudscraper.create_scraper()
            resp = c_scraper.get(url)
            resp.raise_for_status()
            return BeautifulSoup(resp.content, 'html.parser')
        except requests.HTTPError as e:
            print(f'HTTP error occurred: {e}')
        except requests.ConnectionError as e:
            print(f'Connection Error occurred: {e}')
        except requests.Timeout as e:
            print(f'Timeout Error occurred: {e}')
        except requests.RequestException as e:
            print(f'General Error occurred: {e}')
        except KeyboardInterrupt:
            print("Someone closed the program")
        except Exception as e:
            print(f'Other error occurred: {e}')
import sys
from os import system, name
from scrapers import nine_scraper
def screen_clear():
    """Clear the terminal: 'cls' on Windows (os.name == 'nt'), 'clear' elsewhere."""
    command = 'cls' if name == 'nt' else 'clear'
    _ = system(command)
def main_menu():
    """Show the top-level menu and loop until the user picks a valid action."""
    while True:
        screen_clear()
        print('------9anime downloader------\n[1] Search \n[2] Download \n[3] Exit\n-----------------------------\n')
        choice = input('Enter your choice [1-3] >')
        if choice == '1':
            search_menu()
            return
        if choice == '3':
            screen_clear()
            sys.exit()
        # '2' (download, not implemented) and anything else: redraw the menu.
def search_menu(query=False):
    """Prompt for (or accept) a search query and display its results."""
    screen_clear()
    print('--------------9anime downloader/search--------------\n')
    if not query:
        query = input('Please enter the name of the anime >')
    if query:
        results_menu(nine_scraper.search(query))
def results_menu(results):
    """List search results and act on the user's choice.

    results: list of (title, link) tuples; pagination entries are titled
    'Previous page' / 'Next page' and are shown as [P] / [N].
    """
    # Bug fix: p and n were referenced before assignment whenever the
    # corresponding pagination entry was absent -- initialise them.
    p = False
    n = False
    for num, result in enumerate(results, 1):
        title = result[0]
        link = result[1]
        if 'Previous page' in title:
            p = True
            print('[P] ' + title)
        elif 'Next page' in title:
            n = True
            print('[N] ' + title)
        else:
            print(f'[{num}] {title}')
    print('[M] Main menu')
    titles, links = map(list, zip(*results))
    while True:
        search_choice = input('Enter choice >')
        try:
            search_choice = int(search_choice)
            # Bug fix: the upper bound was len(results) + 1, which let an
            # index run one past the end of links/titles (IndexError).
            if 1 <= search_choice <= len(results):
                print(links[search_choice - 1])
                print(titles[search_choice - 1])
                ep_links = nine_scraper.get_ep_links(links[search_choice - 1])
                for link in ep_links:
                    print(link)
                    nine_scraper.find_download(link)
                # series_menu(links[search_choice - 1])
                break
        except ValueError:
            # Non-numeric input: menu letters.
            if search_choice.lower() == 'm':
                main_menu()
                break
            elif search_choice.lower() == 'p':
                if p:
                    # Second-to-last entry holds the previous-page query.
                    search_menu(links[-2])
                    break
                continue
            elif search_choice.lower() == 'n':
                if n:
                    # Last entry holds the next-page query.
                    search_menu(links.pop())
                    break
                continue
def series_menu(url):
    """Show series info for *url* (info retrieval not implemented yet)."""
    # Bug fix: get_series_info takes the series url; calling it with no
    # argument raised TypeError.
    info = nine_scraper.get_series_info(url)


main_menu()  # start the interactive program
I know it has to be some JavaScript that is redirecting the page, but I can't figure out what I need to do in order to stop that. Any help would be very appreciated!
Using requests_html you can set allow_redirects=False like this:
r = session.get(url,allow_redirects=False)
Now your request should go only to the requested URL.

Trying to scrape data off of a website using Python and Chromedriver, but it's returning a nonetype error for "find"

I am trying to scrape data off of WhoScored.com. I am not sure what is the best way to do it or if anyone is familiar with this particular website, but I have a Python script that is supposed to scrape the data.
Here is my code:
import time
import bs4
import selenium_func as sel
from helper_functions import read_from_file, append_to_file
TIERS_PATH = 'tiers_urls/tiers_urls.txt'
TEAMS_PATH = 'teams_urls/teams_urls.txt'
TEAMS_LOGS = 'teams_urls/teams_logs.txt'
"""
Functions
"""
def get_teams_urls(start_idx):
"""
Searches each tier and extracts all the teams' urls within that tier.
"""
server, driver = sel.start_server_and_driver()
tiers_urls = read_from_file(TIERS_PATH)
length = len(tiers_urls)
for tier in tiers_urls[start_idx:]:
error = False
teams_urls = []
try:
complete_url = sel.WHOSCORED_URL + tier
try:
driver.get(complete_url)
content = driver.page_source
soup = bs4.BeautifulSoup(''.join(content), 'lxml')
except Exception as e:
print('\n')
print("Problem accessing {}".format(tier))
print(str(e))
print('\n')
append_to_file("\nError accessing: " + tier + "\n", TEAMS_LOGS)
append_to_file("Index: " + str(tiers_urls.index(tier)), TEAMS_LOGS)
continue
stage = None
stages_div = soup.find('div', {'id':'sub-navigation'})
if stages_div != None:
stage_li = stages_div.find_all('li')[0]
if stage_li != None:
stage_href = stage_li.find('a', href=True)['href']
if stage_href != None:
stage = stage_href.split('/')[8]
if stage != None:
standings_table = soup.find('div', {'id':'standings-'+stage})
standings_tbody = standings_table.find(id='standings-'+stage+'-content')
teams_tr = standings_tbody.find_all('tr')
if len(teams_tr) > 0:
for tr in teams_tr:
team_td = tr.find_all('td')[1]
team_href = team_td.find('a', href=True)['href']
teams_urls.append(team_href)
except Exception as e:
print('\n')
print("Problem reading data from: {}".format(tier))
print(str(e))
print('\n')
append_to_file("\nError reading data from: " + tier + "\n", TEAMS_LOGS)
append_to_file("Index: " + str(tiers_urls.index(tier)), TEAMS_LOGS)
error = True
if error == False:
if len(teams_urls) > 0:
to_store = {tier:teams_urls}
append_to_file(str(to_store), TEAMS_PATH)
append_to_file("\nSuccessfully retrieved from: " + str(tiers_urls.index(tier)) + "/" + str(length), TEAMS_LOGS)
time.sleep(1)
sel.stop_server_and_driver(server, driver)
return
if __name__ == '__main__':
get_teams_urls(0)
I am trying to scrape data off of WhoScored.com and it opens up the website, but it returns this error:
'NoneType' object has no attribute 'find'
How do I fix this and successfully scrape the data ?
Sounds like you need some null/None-checks:
for tr in teams_tr:
team_td = tr.find_all('td')[1]
if team_td != None:
team_href = team_td.find('a', href=True)['href']
teams_urls.append(team_href)
You didn't check if team_td was None before calling find

Best way to make thousands of get requests in python

Right now I am working on a python script which takes in a list of url's as an argument, then performs a GET request on each url and then searches through the output with xpath to fingerprint the website. It seems to work like a charm when the list is around 50 sites long, but anything after that causes the program to slow down to the point where it stop (usually around 150 sites). Scroll down to where you see main app logic and the relevant code it below. Right now I am just using 50 elements in the array and it works fine, but anything after makes the entire program stop. Any suggestions would be greatly appreciated!
#!/usr/bin/python
# Web Scraper
# 1.0
# Imports for file
from multiprocessing.dummy import Pool as ThreadPool
from threading import Thread
from Queue import Queue
from lxml import html
import requests
import time
import sys
# Get Raw HTML
def scrape(url):
    """GET *url* and return an lxml document tree, or False on any failure.

    Cleanup: the original created and immediately closed a fresh
    requests.session() on every exit path; that session was never used for
    the request, so the calls were pure overhead and have been removed.
    """
    try:
        page = requests.get(url, timeout=2.0)
        if page.status_code == requests.codes.ok:
            return html.fromstring(page.content)
        return False
    except Exception:
        # Covers network errors and malformed-document parse errors as the
        # original bare except did -- but no longer swallows SystemExit or
        # KeyboardInterrupt.
        return False
# Format URL
def format_url(url):
    """Normalise *url*: ensure it has a scheme and strip one trailing slash.

    Bug fix: the original tested url.find("http://") == -1, so an https URL
    (or any URL merely *containing* "http://") was mangled into
    "http://https://...".  Test the prefix against both schemes instead.
    """
    if not url.startswith(("http://", "https://")):
        url = "http://" + url
    if url[-1] == "/":
        url = url[:-1]
    return url
# Check if WordPress Site
def check_wordpress(tree):
    """Return True if the page loads scripts from wp-content (a WordPress tell)."""
    # Bug fix: the XPath attribute axis is '@src'; '#src' (a markdown/paste
    # artifact) is invalid XPath and makes lxml raise XPathEvalError.
    scripts = tree.xpath("//script[contains(@src,'wp-content')]")
    if len(scripts) > 0:
        return True
    return False
# Check WordPress Version
def wordpress_version(tree):
    """Return the WordPress version from the generator meta tag.

    Returns the version string when the generator is WordPress, the full
    generator string when it is something else, or 0 when the tag is absent.
    """
    # Bug fix: XPath attributes are addressed with '@', not '#'.
    type = tree.xpath("//meta[@name='generator']/@content")
    version = 0
    if len(type) > 0:
        details = type[0].split()
        if len(details) > 1 and details[0] == "WordPress":
            version = details[1]
        else:
            version = type[0]
    return version
# Find Contact Page
def find_contact_page(tree, url=""):
    """Search *tree* for a Contact link and print its URL.

    Bug fixes: XPath attributes use '@href', not the pasted '#href'; and the
    original referenced an undefined global ``url`` when absolutising a
    relative link, so *url* is now an optional, backward-compatible
    parameter (the site's base URL).
    """
    contact = tree.xpath("//a[contains(text(),'Contact')]/@href")
    try_xpath = 1
    # Fall back through progressively looser selectors, at most twice.
    while len(contact) == 0:
        if try_xpath == 1:
            contact = tree.xpath("//span[contains(text(),'Contact')]/../@href")
        elif try_xpath == 2:
            contact = tree.xpath("//p[contains(text(),'Contact')]/../@href")
        elif try_xpath == 3:
            break
        try_xpath += 1
    if len(contact) > 0:
        contact = contact[0]
        if contact.find('#') == -1:  # skip in-page anchors
            if contact[0] == '/':    # absolutise site-relative links
                contact = url + "" + contact
            print(contact)
# Juicer method
def juice(url):
    """Fingerprint *url* and return a tab-separated one-line summary."""
    url = format_url(url)
    string = url
    tree = scrape(url)
    # 'is False' rather than truthiness: lxml elements with no children are
    # themselves falsy, so 'not tree' would misreport empty-but-valid pages;
    # scrape() returns the literal False on failure.
    if tree is False:
        return string + " \t\t\t No XML tree"
    elif check_wordpress(tree) == True:
        version = wordpress_version(tree)
        return string + " \t\t\t WordPress: " + str(version)
    else:
        return string + " \t\t\t Not WordPress"
# Main App Logic Below ------------------------------------->
# Open list of websites from given argument
# Main App Logic Below ------------------------------------->
# Open list of websites from given argument
# NOTE(review): 'list' shadows the builtin; kept as-is in this doc-only pass.
list = open(sys.argv[1],'r').read().split('\n')
# Juice url
def juice_url():
    # Worker: pull URLs off the shared queue forever (runs in daemon threads).
    while True:
        url = q.get()
        result = juice(url)
        print result
        q.task_done()
# Create concurrent queues
concurrent = 50  # worker-thread count, also the queue's size bound
q = Queue(concurrent)
for i in range(concurrent):
    t = Thread(target=juice_url)
    t.daemon = True  # daemon threads don't block interpreter exit
    t.start()
# Add URL to Queue
time1 = time.time()
for url in list[0:50]:  # NOTE(review): hard-coded to the first 50 sites
    q.put(url)
q.join()  # block until every queued URL has been processed
# Calculate total time
total = time.time() - time1
print "Total Time: %f" % total
print "Average Time: %f" % (total/50)

Categories