Looping URLs for scraping on BeautifulSoup - python

My script currently works through a list of 5 URLs; once it reaches the end of the list it stops scraping. I want it to loop back to the first URL after it completes the last one. How would I achieve that?
The reason I want it to loop is to monitor for any changes in the products, such as the price.
I tried looking at a few methods I found online but couldn't figure it out, as I am new to this. Hope you can help!
import requests
import lxml.html
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from dhooks import Webhook, Embed
import random

ua = UserAgent()
header = {'User-Agent': ua.chrome}

# Proxies
proxy_list = []
for line in open('proxies.txt', 'r'):
    line = line.replace('\n', '')
    proxy_list.append(line)

def get_proxy():
    proxy = random.choice(proxy_list)
    proxies = {
        "http": f'{str(proxy)}',
        "https": f'{str(proxy)}'
    }
    return proxies

# Opening URL file
with open('urls.txt', 'r') as file:
    for url in file.readlines():
        proxies = get_proxy()
        result = requests.get(url.strip(), headers=header, timeout=4, proxies=proxies)
        #src = result.content
        soup = BeautifulSoup(result.content, 'lxml')

You can store the URLs in a list and loop over it with a while loop; the basic logic will be:
with open('urls.txt', 'r') as file:
    url_list = file.readlines()

pos = 0
while True:
    if pos >= len(url_list):
        pos = 0
    url = url_list[pos]
    pos += 1
    *** rest of your logic ***

You can add a while True: loop outside and above your main with statement and for loop (and add one level of indentation to every line inside). This way the program will keep running until it is terminated by the user.
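For example, a minimal sketch of that approach, reusing the header and get_proxy() already defined in the question (the pause at the end is optional and the 60 seconds is an arbitrary choice):

import time

while True:
    with open('urls.txt', 'r') as file:
        for url in file.readlines():
            proxies = get_proxy()
            result = requests.get(url.strip(), headers=header, timeout=4, proxies=proxies)
            soup = BeautifulSoup(result.content, 'lxml')
            # ... compare price / other fields against the previous pass here ...
    time.sleep(60)  # optional pause between passes so the site isn't hammered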

Related

How to check if the specific text on a website changed using python script

I'm trying to write a Python script to check the status's display text for a specific country (i.e. Ecuador)
on this website:
https://immi.homeaffairs.gov.au/what-we-do/whm-program/status-of-country-caps.
How do I keep track of that specific text when a change happens?
Currently, I compare hash codes of the page after a time delay interval, but the hash code changes every time even though nothing changes visually.
import hashlib
import time
import urllib.request

input_website = 'https://immi.homeaffairs.gov.au/what-we-do/whm-program/status-of-country-caps'
time_delay = 60

# Monitor the website
def monitor_website():
    # Run the loop to keep monitoring
    while True:
        # Visit the website to know if it is up
        status = urllib.request.urlopen(input_website).getcode()
        # If it returns 200, the website is up
        if status != 200:
            # Call email function
            send_email("The website is DOWN")
        else:
            send_email("The website is UP")
        # Open url and create the hash code
        response = urllib.request.urlopen(input_website).read()
        current_hash = hashlib.sha224(response).hexdigest()
        # Revisit the website after time delay
        time.sleep(time_delay)
        # Visit the website after the delay and generate the new hash
        response = urllib.request.urlopen(input_website).read()
        new_hash = hashlib.sha224(response).hexdigest()
        # Check the hash codes
        if new_hash != current_hash:
            send_email("The website CHANGED")
Can you check it using Beautiful Soup?
Crawl the page for "Ecuador" and then check the next word for "suspended**"
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = "https://immi.homeaffairs.gov.au/what-we-do/whm-program/status-of-country-caps"
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')

# Create a list of the text of all 'td' tags
list_name = list()
tags = soup('td')
for tag in tags:
    # Take out whitespace and the \u200b unicode character
    cell_text = tag.get_text().strip(u'\u200b').strip()
    list_name.append(cell_text)

# Search the list for Ecuador and the item that follows it
country_status = {}
for i in range(len(list_name)):
    if "Ecuador" in list_name[i]:
        country_status[list_name[i]] = list_name[i + 1]
        print(country_status)
    else:
        continue

# Check the status
if country_status["Ecuador"] != "suspended**":
    print("Website has changed")

Unable to pass proxies and links to the threadpool to get results

I've written a script in Python that uses proxies to scrape the links of different posts while traversing different pages of a website. The script is supposed to take a random proxy from a list, send a request to the website, and finally parse the items. If a proxy is not working, it should be kicked out of the list.
I thought the way I've used the proxies and the list of urls within ThreadPool(10).starmap(make_requests, zip(proxyVault, lead_url)) was accurate, but it doesn't produce any results; rather, the script gets stuck.
How can I pass the proxies and the links to the ThreadPool so that the script produces results?
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from multiprocessing.pool import ThreadPool
from itertools import cycle
import random

base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'
lead_url = ["https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&page={}&pagesize=15".format(page) for page in range(1, 6)]
proxyVault = ['104.248.159.145:8888', '113.53.83.252:54356', '206.189.236.200:80', '218.48.229.173:808', '119.15.90.38:60622', '186.250.176.156:42575']

def make_requests(proxyVault, lead_url):
    while True:
        random.shuffle(proxyVault)
        global pitem
        pitem = cycle(proxyVault)
        proxy = {'https': 'http://{}'.format(next(pitem))}
        try:
            res = requests.get(lead_url, proxies=proxy)
            soup = BeautifulSoup(res.text, "lxml")
            [get_title(proxy, urljoin(base_url, item.get("href"))) for item in soup.select(".summary .question-hyperlink")]
        except Exception:
            try:
                proxyVault.pop(0)
                make_requests(proxyVault, lead_url)
            except Exception:
                pass

def get_title(proxy, itemlink):
    res = requests.get(itemlink, proxies=proxy)
    soup = BeautifulSoup(res.text, "lxml")
    print(soup.select_one("h1[itemprop='name'] a").text)

if __name__ == '__main__':
    ThreadPool(10).starmap(make_requests, zip(proxyVault, lead_url))
Btw, the proxies used above are just placeholders.
The problem with your code was that it was creating a lot of endless loops in the threads. Also, the way you handled the proxies was a bit strange to me, so I changed it.
I also think you had misunderstood how data is sent to the threads: each thread gets one element of the iterable, not the whole thing. So I changed some names to reflect that.
The way it works now is that each thread gets its own url from lead_url and then chooses a random proxy from proxyVault.
It fetches the webpage, parses it, and calls get_title on each of the parsed links.
If the request fails because of the proxy, that proxy is removed from the list so it's not used again, and make_requests is called again, which will randomly choose a new proxy from the ones that are still available.
I did not change the actual parsing, because I can't judge whether it's what you want or not.
Runnable code:
https://repl.it/#zlim00/unable-to-pass-proxies-and-links-to-the-threadpool-to-get-re
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from random import choice
import requests
from urllib.parse import urljoin

base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'
lead_url = [f'https://stackoverflow.com/questions/tagged/web-scraping?sort='
            f'newest&page={page}&pagesize=15' for page in range(1, 6)]

proxyVault = ['36.67.57.45:53367', '5.202.150.233:42895',
              '85.187.184.129:8080', '109.195.23.223:45947']

def make_requests(url):
    proxy_url = choice(proxyVault)
    proxy = {'https': f'http://{proxy_url}'}
    try:
        res = requests.get(url, proxies=proxy)
        soup = BeautifulSoup(res.text, "lxml")
        [get_title(proxy, urljoin(base_url, item.get("href")))
         for item in soup.select(".summary .question-hyperlink")]
    except requests.exceptions.ProxyError:
        # Check that the bad proxy was not already removed by another thread
        if proxy_url in proxyVault:
            proxyVault.remove(proxy_url)
            print(f'Removed bad proxy: {proxy_url}')
        return make_requests(url)

def get_title(proxy, itemlink):
    res = requests.get(itemlink, proxies=proxy)
    soup = BeautifulSoup(res.text, "lxml")
    print(soup.select_one("h1[itemprop='name'] a").text)

if __name__ == '__main__':
    ThreadPool(10).map(make_requests, lead_url)
Maybe you can use another approach to get proxies, like this:
def get_proxy():
    url = 'https://free-proxy-list.net/anonymous-proxy.html'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find('table', attrs={'id': 'proxylisttable'})
    table_body = table.find('tbody')
    proxies = table_body.find_all('tr')
    proxy_row = random.choice(proxies).find_all('td')
    return proxy_row[0].text + ':' + proxy_row[1].text
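One possible way to plug that into the threaded version above (an untested sketch that reuses base_url, get_title, and the imports from the earlier answer; it simply replaces the choice from proxyVault with a freshly scraped proxy):

def make_requests(url):
    proxy_url = get_proxy()  # scrape a fresh proxy instead of choosing from proxyVault
    proxy = {'https': f'http://{proxy_url}'}
    try:
        res = requests.get(url, proxies=proxy, timeout=10)
        soup = BeautifulSoup(res.text, "lxml")
        for item in soup.select(".summary .question-hyperlink"):
            get_title(proxy, urljoin(base_url, item.get("href")))
    except requests.exceptions.ProxyError:
        # The scraped proxy was dead; simply try again with a new one
        return make_requests(url)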

Removing all characters after URL?

Basically, I'm trying to remove all the characters after the URL extension in a URL, but it's proving difficult. The application works off a list of various URLs with various extensions.
Here's my source:
import requests
from bs4 import BeautifulSoup
from time import sleep
#takes userinput for path of panels they want tested
import_file_path = input('Enter the path of the websites to be tested: ')
#takes userinput for path of exported file
export_file_path = input('Enter the path of where we should export the panels to: ')
#reads imported panels
with open(import_file_path, 'r') as panels:
    panel_list = []
    for line in panels:
        panel_list.append(line)

x = 0
for panel in panel_list:
    url = requests.get(panel)
    soup = BeautifulSoup(url.content, "html.parser")
    forms = soup.find_all("form")
    action = soup.find('form').get('action')
    values = {
        soup.find_all("input")[0].get("name"): "user",
        soup.find_all("input")[1].get("name"): "pass"
    }
    print(values)
    r = requests.post(action, data=values)
    print(r.headers)
    print(r.status_code)
    print(action)
    sleep(10)
    x += 1
What I'm trying to achieve is an application that automatically tests your username/password against a list of URLs provided in a text document. However, BeautifulSoup returns an incomplete URL when crawling for action attributes, i.e. instead of returning the full http://example.com/action.php it will return action.php, exactly as it appears in the page's markup. The only way I can think of to get past this would be to restate the 'action' variable as 'panel', with all characters after the URL extension removed, followed by 'action'.
Thanks!
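One way around stripping characters manually is to resolve the relative action against the page URL with urllib.parse.urljoin. A minimal sketch, assuming panel, soup, and values are the variables from the loop above:

from urllib.parse import urljoin

page_url = panel.strip()                    # the URL the form was scraped from
action = soup.find('form').get('action')

# urljoin handles both relative actions ("action.php", "/login.php")
# and already-absolute ones ("http://example.com/action.php")
post_url = urljoin(page_url, action)
r = requests.post(post_url, data=values)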

web scraping and 403 forbidden: My web scraper is blocked by a website, what should I do to make request?

I wrote a script to pull data from a website. But after several runs, requests started returning 403 Forbidden.
What should I do about this issue?
My code is below:
import requests, bs4
import csv

links = []
with open('1-432.csv', 'rb') as urls:
    reader = csv.reader(urls)
    for i in reader:
        links.append(i[0])

info = []
nbr = 1
for url in links:
    # Problem is here.
    sub = []
    r = requests.get(url)
    soup = bs4.BeautifulSoup(r.text, 'lxml')
    start = soup.find('em')
    forname = soup.find_all('b')
    name = []
    for b in forname:
        name.append(b.text)
    name = name[7]
    sub.append(name.encode('utf-8'))
    for b in start.find_next_siblings('b'):
        if b.text in ('Category:', 'Website:', 'Email:', 'Phone'):
            sub.append(b.next_sibling.strip().encode('utf-8'))
    info.append(sub)
    print('Page ' + str(nbr) + ' is saved')
    with open('Canada_info_4.csv', 'wb') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for u in info:
            wr.writerow(u)
    nbr += 1
What should I do to make requests to the website?
An example URL is http://www.worldhospitaldirectory.com/dr-bhandare-hospital/info/43225.
Thanks.
There are a bunch of different things that could be the problem, and depending on what their blacklisting policy is, it might be too late to fix.
At the very least, scraping like this is generally considered to be dick behavior. You're hammering their server. Try putting a time.sleep(10) inside your main loop.
Secondly, try setting your user agents. See here or here
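For instance, a minimal sketch of both suggestions (the User-Agent string below is just an example browser string, not something the site requires, and links is the list from your code):

import time
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

for url in links:
    r = requests.get(url, headers=headers)
    # ... parse as before ...
    time.sleep(10)  # be gentle with the server between requests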
A better solution though would be to see if they have an API you can use.

Python Multi Threading using Requests and BeautifulSoup

I'm writing a web scraper. I could've just used scrapy but decided to write it from scratch so I can practice.
I've created a scraper that works successfully using requests and BeautifulSoup. It navigates through about 135 pages with 12 items on each, grabs the links, and then grabs the information from each link's destination. At the end it writes everything to a CSV file. It only grabs strings and doesn't download any images or anything like that… for now.
Problem? It's quite slow. It takes about 5 seconds to grab everything from one page, and that times 135 is about 11 minutes.
So my question is: how do I implement threading in my code so it gets the data way faster?
Here's the code:
import requests
from bs4 import BeautifulSoup
import re
import csv

def get_actor_dict_from_html(url, html):
    soup = BeautifulSoup(html, "html.parser")

    #There must be a better way to handle this, but let's assign a NULL value to all upcoming variables.
    profileName = profileImage = profileHeight = profileWeight = 'NULL'

    #Let's get the name and image..
    profileName = str.strip(soup.find('h1').get_text())
    profileImage = "http://images.host.com/actors/" + re.findall(r'\d+', url)[0] + "/actor-large.jpg"

    #Now the rest of the stuff..
    try:
        profileHeight = soup.find('a', {"title": "Height"}).get_text()
    except:
        pass
    try:
        profileWeight = soup.find('a', {"title": "Weight"}).get_text()
    except:
        pass

    return {
        'Name': profileName,
        'ImageUrl': profileImage,
        'Height': profileHeight,
        'Weight': profileWeight,
    }

def lotta_downloads():
    output = open("/tmp/export.csv", 'w', newline='')
    wr = csv.DictWriter(output, ['Name', 'ImageUrl', 'Height', 'Weight'], delimiter=',')
    wr.writeheader()
    for i in range(135):
        url = "http://www.host.com/actors/all-actors/name/{}/".format(i)
        response = requests.get(url)
        html = response.content
        soup = BeautifulSoup(html, "html.parser")
        links = soup.find_all("div", {"class": "card-image"})
        for a in links:
            for url in a.find_all('a'):
                url = "http://www.host.com" + url['href']
                print(url)
                response = requests.get(url)
                html = response.content
                actor_dict = get_actor_dict_from_html(url, html)
                wr.writerow(actor_dict)
    print('All Done!')

if __name__ == "__main__":
    lotta_downloads()
Thanks!
Why don't you try using the gevent library?
gevent's monkey patching turns blocking functions into non-blocking ones.
Maybe the wait time of the requests is too long and that is what makes it slow, so making the requests non-blocking should make your program faster.
On Python 2.7.10, for example:
import csv

import gevent
from gevent import monkey; monkey.patch_all()  # Fix import code
import bs4
import requests

actor_dict_list = []

def worker(url):
    content = requests.get(url).content
    soup = bs4.BeautifulSoup(content)
    links = soup.find_all('div', {'class': 'card-image'})
    for a in links:
        for url in a.find_all('a'):
            response = requests.get(url)  # You can also use the gevent spawn function on this line
            html = response.content
            ...
            # Append to a list to prevent a race condition
            actor_dict_list.append(get_actor_dict_from_html(url, html))

output = open("/tmp/export.csv", "w", newline='')
wr = csv.DictWriter(output, ['Name', 'ImageUrl', 'Height', 'Weight'], delimiter=',')
wr.writeheader()

urls = ["http://www.host.com/actors/all-actors/name/{}/".format(i) for i in range(135)]
jobs = [gevent.spawn(worker, url) for url in urls]
gevent.joinall(jobs)

for actor_dict in actor_dict_list:
    wr.writerow(actor_dict)
Public gevent documentation: doc
P.S.
You must install python-gevent if you are on Ubuntu:
sudo apt-get install python-gevent
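If you'd rather stay with the standard library instead of gevent, here is a rough sketch of the same idea with concurrent.futures. It reuses get_actor_dict_from_html from the question; the helper name scrape_listing_page and the worker count of 10 are my own choices:

import csv
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup

def scrape_listing_page(i):
    # Fetch one listing page and return the actor dicts found on it
    url = "http://www.host.com/actors/all-actors/name/{}/".format(i)
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    results = []
    for card in soup.find_all("div", {"class": "card-image"}):
        for a in card.find_all('a'):
            actor_url = "http://www.host.com" + a['href']
            html = requests.get(actor_url).content
            results.append(get_actor_dict_from_html(actor_url, html))
    return results

# Fetch the 135 listing pages concurrently; each worker handles one page
with ThreadPoolExecutor(max_workers=10) as pool:
    pages = pool.map(scrape_listing_page, range(135))

# Write everything out once all pages are done
with open("/tmp/export.csv", 'w', newline='') as output:
    wr = csv.DictWriter(output, ['Name', 'ImageUrl', 'Height', 'Weight'], delimiter=',')
    wr.writeheader()
    for page in pages:
        for actor_dict in page:
            wr.writerow(actor_dict)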
