I'm trying to create a script in Python, applying concurrent.futures within it, to fetch the links of different users from a webpage. Although the user links are available on its landing page, I'm trying to dig them out from the inner pages. When I use yield within the get_links() function and print() within get_target_link(), I can get the results as expected.
My question is: how can I achieve the same thing using yield within both of the functions?
I've tried:
import requests
import concurrent.futures
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def get_links(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".summary .question-hyperlink"):
        yield urljoin(base, item.get("href"))

def get_target_link(targeturl):
    res = requests.get(targeturl)
    soup = BeautifulSoup(res.text, "lxml")
    name_link = urljoin(base, soup.select_one(".user-details > a").get("href"))
    yield name_link

if __name__ == '__main__':
    base = 'https://stackoverflow.com'
    mlink = "https://stackoverflow.com/questions/tagged/web-scraping"

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(get_target_link, url): url for url in get_links(mlink)}
        concurrent.futures.as_completed(future_to_url)
The above script produces no result at all.
A few problems with your initial approach are causing the "no result at all" behavior:
BeautifulSoup(res.text, "lxml") - change the parser to html.parser (you are parsing HTML web pages)
there's no benefit in making get_target_link a generator, since it isn't meant to be iterated and it only produces a single result at a time
concurrent.futures.as_completed returns an iterator over the Future instances, not the final results, so you still have to loop over it and call result() on each future
The corrected approach would look like below:
import requests
import concurrent.futures as futures
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def get_links(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    for link in soup.select(".summary .question-hyperlink"):
        yield urljoin(base, link.get("href"))

def get_target_link(target_url):
    res = requests.get(target_url)
    soup = BeautifulSoup(res.text, "html.parser")
    name_link = urljoin(base, soup.select_one(".user-details a").get("href"))
    return name_link

if __name__ == '__main__':
    base = 'https://stackoverflow.com'
    mlink = "https://stackoverflow.com/questions/tagged/web-scraping"

    with futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(get_target_link, url): url for url in get_links(mlink)}
        for future in futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
            except Exception as ex:
                print(f'Failed to extract user details from url: {url}')
            else:
                print(data)
The output:
https://stackoverflow.com/users/10035985/andrej-kesely
https://stackoverflow.com/users/11520568/rachit-gupta
https://stackoverflow.com/users/10568531/robots-txt
https://stackoverflow.com/users/10664939/logan-anderson
https://stackoverflow.com/users/688393/c%c3%a9sar
https://stackoverflow.com/users/903061/gregor
https://stackoverflow.com/users/9950503/saraherceg
https://stackoverflow.com/users/80851/gmile
https://stackoverflow.com/users/11793150/saurabh-rawat
https://stackoverflow.com/users/11793061/xzatar
https://stackoverflow.com/users/11759292/rachel9866
https://stackoverflow.com/users/2628114/user2628114
https://stackoverflow.com/users/9810397/bart
https://stackoverflow.com/users/838355/ir2pid
https://stackoverflow.com/users/10629482/shreya
https://stackoverflow.com/users/11669928/thor-is
https://stackoverflow.com/users/7660288/acro2142
https://stackoverflow.com/users/3342430/freddiev4
https://stackoverflow.com/users/11767045/k-%c3%96sterlund
https://stackoverflow.com/users/11781213/mohamed-shire
https://stackoverflow.com/users/5412619/a-nonymous
https://stackoverflow.com/users/4354477/forcebru
https://stackoverflow.com/users/10568531/robots-txt
https://stackoverflow.com/users/6622587/eyllanesc
https://stackoverflow.com/users/10568531/robots-txt
https://stackoverflow.com/users/3273177/casabonita
https://stackoverflow.com/users/1540328/dipesh-parmar
https://stackoverflow.com/users/6231957/perth
https://stackoverflow.com/users/11400264/workin-4weekend
https://stackoverflow.com/users/1000551/vadim-kotov
https://stackoverflow.com/users/331508/brock-adams
https://stackoverflow.com/users/11300154/helloworld1990
https://stackoverflow.com/users/11786268/mohsine-jirou
https://stackoverflow.com/users/9707561/fatima-tt
https://stackoverflow.com/users/11759292/rachel9866
https://stackoverflow.com/users/6622587/eyllanesc
https://stackoverflow.com/users/11485683/titan
https://stackoverflow.com/users/11593630/supek
https://stackoverflow.com/users/11717116/raja-kishore-patnayakuni
https://stackoverflow.com/users/975887/madushan
https://stackoverflow.com/users/10568531/robots-txt
https://stackoverflow.com/users/283366/phil
https://stackoverflow.com/users/8677101/bpdesilva
https://stackoverflow.com/users/3504096/programmerper
https://stackoverflow.com/users/6303216/akhlaq-ahmed
https://stackoverflow.com/users/11457578/sh-student
https://stackoverflow.com/users/11783947/alexis-cruz-cruz
https://stackoverflow.com/users/3579212/adnanmuttaleb
https://stackoverflow.com/users/1060350/anony-mousse
https://stackoverflow.com/users/8100732/khadija-saeed
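If you really do want to keep yield in both functions, the same pattern shown later in this thread applies here: each future's result is then a generator, so consume it with a nested loop. A minimal sketch against the corrected code above (note that a generator body only runs when it is iterated, so the request inside get_target_link ends up executing when future.result() is consumed rather than in the worker thread):

def get_target_link(target_url):
    res = requests.get(target_url)
    soup = BeautifulSoup(res.text, "html.parser")
    # kept as a generator, as originally asked
    yield urljoin(base, soup.select_one(".user-details a").get("href"))

with futures.ThreadPoolExecutor(max_workers=10) as executor:
    future_to_url = {executor.submit(get_target_link, url): url for url in get_links(mlink)}
    for future in futures.as_completed(future_to_url):
        for name_link in future.result():
            print(name_link)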
I need to download all the files from this page:
https://www.dmo.gov.uk/publications/?offset=0&itemsPerPage=1000000&parentFilter=1433&childFilter=1433%7C1450&startMonth=1&startYear=2008&endMonth=6&endYear=2021
that have "Auction of" on their titles. This is the source for one of the files for example:
Auction of £2,500 million of 0 5/8% Treasury Gilt 2035
I am trying to adapt some code I found from another question, but the pages are coming back empty:
import os
import re
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

def download_pgn(task):
    session, url, destination_path = task
    response = session.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    game_url = host + soup.find("a", text="download").get("href")
    filename = re.search(r"\w+\.pgn", game_url).group()
    path = os.path.join(destination_path, filename)
    response = session.get(game_url, stream=True)
    response.raise_for_status()
    with open(path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)

if __name__ == "__main__":
    destination_path = "pgns"
    max_workers = 8
    if not os.path.exists(destination_path):
        os.makedirs(destination_path)
    with requests.Session() as session:
        response = session.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        pages = soup.find_all("a", href=re.compile(r".*Auction of\?.*"))
        tasks = [
            (session, host + page.get("href"), destination_path)
            for page in pages
        ]
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            pool.map(download_pgn, tasks)
Check your regular expression syntax. The regex r".*Auction of\?.*" will only match titles that literally contain "Auction of?", since \? matches a question mark.
But the href= parameter matches against the URL of the link, not its text, so that won't help you much either. This will find the links with matching titles:
links = soup.find_all("a", string=re.compile(r"Auction of\b"))
And this will extract their URLs so you can retrieve them:
[ file["href"] for file in links ]
This is what ended up working for me:
from bs4 import BeautifulSoup
import requests
import re

links = []
url = 'https://www.dmo.gov.uk/publications/?offset=0&itemsPerPage=1000000000&parentFilter=1433&childFilter=1433|1450&startMonth=1&startYear=2000&endMonth=6&endYear=2021'
req = requests.get(url)
soup = BeautifulSoup(req.text, "lxml")

for a in soup.find_all("a", {"aria-label": re.compile(r"^Auction of\b")}, href=True):
    links.append(a['href'])

def download_file(url):
    path = url.split('/')[-1].split('?')[0]
    r = requests.get(url, stream=True)
    if r.status_code == 200:
        with open(path, 'wb') as f:
            for chunk in r:
                f.write(chunk)

host = 'https://www.dmo.gov.uk/'
for link in links:
    url = host + link
    download_file(url)
The find_all() method accepts a function. You can create a lambda function to filter for all a tags whose text contains "Auction of":
for tag in soup.find_all(lambda t: t.name == "a" and "Auction of" in t.get_text()):
    print(tag.text)
Or, you can use an [attribute*=value]:
# Find all `a` tags whose `aria-label` attribute contains `Auction of`
for tag in soup.select("a[aria-label*='Auction of']"):
    print(tag.text)
I'm trying to create a script using concurrent.futures to make the execution faster. The site link that I've used within this script is a placeholder but the logic is the same.
I'm trying to let the script parse the target links from its landing page and then use those newly scraped links to fetch the required information from their inner pages. There is a pagination button on the landing page which leads to the next pages. FYI, there is no highest page number associated with the next-page button, so I have to stick with the next-page link the way I've shown below.
The way the following script goes for the next pages is slowing the process down.
Here is what I've tried so far:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

def get_links(link):
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "html.parser")
    for item in soup.select(".summary .question-hyperlink"):
        target_link = base.format(item.get("href"))
        yield target_link

    next_page = soup.select_one("a[rel='next']:contains('Next')")
    if next_page:
        next_page_link = base.format(next_page.get("href"))
        yield from get_links(next_page_link)

def get_info(target_link):
    res = requests.get(target_link)
    soup = BeautifulSoup(res.text, "html.parser")
    title = soup.select_one("h1[itemprop='name'] > a").get_text(strip=True)
    user_name = soup.select_one(".user-details[itemprop='author'] > a").get_text(strip=True)
    return user_name, title

if __name__ == '__main__':
    base = "https://stackoverflow.com{}"
    url = "https://stackoverflow.com/questions/tagged/web-scraping"

    with ThreadPoolExecutor(max_workers=6) as executor:
        for r in [executor.submit(get_info, item) for item in get_links(url)]:
            print(r.result())
What type of change should I bring about within the script to make it run faster?
There are several changes you can make:
Increase the thread pool size.
Use a requests Session object since you are making all your GET requests to the same website.
Instead of creating a list of Future instances, use a generator expression; you will start getting output immediately.
I have also added code to test for the presence of the expected fields. It turns out that after some number of requests, Stack Overflow stops honoring them and returns a "Too Many Requests" error page, so you get no useful data back, as I painfully found out. So I suspect your goal of increasing speed will ultimately be hampered by this.
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from functools import partial

def get_links(session, link):
    res = session.get(link)
    soup = BeautifulSoup(res.text, "html.parser")
    for item in soup.select(".summary .question-hyperlink"):
        target_link = base.format(item.get("href"))
        yield target_link

    next_page = soup.select_one("a[rel='next']:contains('Next')")
    if next_page:
        next_page_link = base.format(next_page.get("href"))
        yield from get_links(session, next_page_link)

def get_info(session, target_link):
    res = session.get(target_link)
    soup = BeautifulSoup(res.text, "html.parser")

    #title = soup.select_one("h1[itemprop='name'] > a").get_text(strip=True)
    title_elem = soup.select_one("h1[itemprop='name'] > a")
    title = title_elem.get_text(strip=True) if title_elem else 'title is missing'

    #user_name = soup.select_one(".user-details[itemprop='author'] > a").get_text(strip=True)
    user_name_elem = soup.select_one(".user-details[itemprop='author'] > a")
    user_name = user_name_elem.get_text(strip=True) if user_name_elem else 'user name is missing'

    return user_name, title

if __name__ == '__main__':
    base = "https://stackoverflow.com{}"
    url = "https://stackoverflow.com/questions/tagged/web-scraping"

    with requests.Session() as session:
        with ThreadPoolExecutor(max_workers=50) as executor:
            for r in (executor.submit(partial(get_info, session), item) for item in get_links(session, url)):
                print(r.result())
I've created a script using the concurrent.futures library to print the results from the fetch_links function. When I use a print statement inside the function, I get the results accordingly. What I wish to do now is print the results from that function using a yield statement.
Is there any way I can modify things under the main block in order to print the results from fetch_links while keeping it as is, meaning keeping the yield statement?
import requests
from bs4 import BeautifulSoup
import concurrent.futures as cf

links = [
    "https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=2&pagesize=50",
    "https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=3&pagesize=50",
    "https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=4&pagesize=50"
]

base = 'https://stackoverflow.com{}'

def fetch_links(s, link):
    r = s.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    for item in soup.select(".summary .question-hyperlink"):
        # print(base.format(item.get("href")))
        yield base.format(item.get("href"))

if __name__ == '__main__':
    with requests.Session() as s:
        with cf.ThreadPoolExecutor(max_workers=5) as exe:
            future_to_url = {exe.submit(fetch_links, s, url): url for url in links}
            cf.as_completed(future_to_url)
Your fetch_links is a generator, so you have to loop over it as well to get the results:
import requests
from bs4 import BeautifulSoup
import concurrent.futures as cf

links = [
    "https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=2&pagesize=50",
    "https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=3&pagesize=50",
    "https://stackoverflow.com/questions/tagged/web-scraping?tab=newest&page=4&pagesize=50"
]

base = 'https://stackoverflow.com{}'

def fetch_links(s, link):
    r = s.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    for item in soup.select(".summary .question-hyperlink"):
        yield base.format(item.get("href"))

if __name__ == '__main__':
    with requests.Session() as s:
        with cf.ThreadPoolExecutor(max_workers=5) as exe:
            future_to_url = {exe.submit(fetch_links, s, url): url for url in links}
            for future in cf.as_completed(future_to_url):
                for result in future.result():
                    print(result)
Output:
https://stackoverflow.com/questions/64298886/rvest-webscraping-in-r-with-form-inputs
https://stackoverflow.com/questions/64298879/is-this-site-not-suited-for-web-scraping-using-beautifulsoup
https://stackoverflow.com/questions/64297907/python-3-extract-html-data-from-sports-site
https://stackoverflow.com/questions/64297728/cant-get-the-fully-loaded-html-for-a-page-using-puppeteer
https://stackoverflow.com/questions/64296859/scrape-text-from-a-span-tag-containing-nested-span-tag-in-beautifulsoup
https://stackoverflow.com/questions/64296656/scrapy-nameerror-name-items-is-not-defined
https://stackoverflow.com/questions/64296201/missing-values-while-scraping-using-beautifulsoup-in-python
https://stackoverflow.com/questions/64296130/how-can-i-identify-the-element-containing-the-link-to-my-linkedin-profile-after
https://stackoverflow.com/questions/64295959/why-use-scrapy-or-beautifulsoup-vs-just-parsing-html-with-regex-v2
https://stackoverflow.com/questions/64295842/how-to-retreive-scrapping-data-from-web-to-json-like-format
https://stackoverflow.com/questions/64295559/how-to-iterate-through-a-supermarket-website-and-getting-the-product-name-and-pr
https://stackoverflow.com/questions/64295509/cant-stop-asyncio-request-for-some-delay
https://stackoverflow.com/questions/64295244/paginate-with-network-requests-scraper
and so on ...
I want to build the link in the async function by passing arguments from my getDaraz function.
from requests_html import AsyncHTMLSession

asession = AsyncHTMLSession()

async def get_daraz_page(keyword, page_no):
    template_link = 'https://www.daraz.com.np/catalog/?_keyori=ss&from=input&page={page_no}&q={keyword}&spm=a2a0e.11779170.search.go.287d2d2bVToBsh'
    r = await asession.get(template_link)
    return r

def getDaraz(search):
    results = asession.run(get_daraz_page(search, 1))
    print(results)

getDaraz("Mouse")
It gives me the following error:
TypeError: 'coroutine' object is not callable
sys:1: RuntimeWarning: coroutine 'get_daraz_page' was never awaited
Thank you.
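For what it's worth, the traceback itself points at the issue: asession.run() expects callables that it can invoke itself, whereas get_daraz_page(search, 1) already creates a coroutine object. A minimal sketch of one way around this, assuming that behavior of requests_html's run() and that the placeholders in template_link are meant to be filled in with format():

from requests_html import AsyncHTMLSession

asession = AsyncHTMLSession()

async def get_daraz_page(keyword, page_no):
    template_link = ('https://www.daraz.com.np/catalog/?_keyori=ss&from=input'
                     '&page={page_no}&q={keyword}'
                     '&spm=a2a0e.11779170.search.go.287d2d2bVToBsh')
    # fill in the page number and keyword before requesting
    return await asession.get(template_link.format(page_no=page_no, keyword=keyword))

def getDaraz(search):
    # wrap the call in a lambda so run() can create the coroutine itself
    results = asession.run(lambda: get_daraz_page(search, 1))
    print(results)

getDaraz("Mouse")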
With requests-html, I got no links at all.
I suggest BeautifulSoup, Requests, and Json. Here is my code.
'''
Build links list.
With BeautifulSoup, Requests, and Json.
To avoid ban/block, delay for 10 to 20 seconds randomly after requests.
'''
import requests
from bs4 import BeautifulSoup
import random
import time
import json

def delay():
    # Delay for 10 to 20 seconds randomly.
    sleep = random.randint(10, 20)
    time.sleep(sleep)

def make_soup(url, parser):
    response = requests.get(url)
    delay()
    data = response.text
    if parser == 'html.parser':
        soup = BeautifulSoup(data, 'html.parser')
    else:
        soup = BeautifulSoup(data, 'lxml')
    return soup

def build_list():
    url = 'https://www.daraz.com.np/catalog/?q=mouse&_keyori=ss&from=input&spm=a2a0e.11779170.search.go.287d2d2bd0IOUA'
    parser = 'html.parser'
    soup = make_soup(url, parser)
    #
    json_tags = soup.find_all('script', {'type': 'application/ld+json'})[1].string
    json_data = json.loads(json_tags)

    links = []
    for item in json_data['itemListElement']:
        links.append(item['url'])
        print(item['url'])

if __name__ == '__main__':
    build_list()
And this is the result.
https://www.daraz.com.np/products/micropack-excalibur-gaming-wired-mouse-g-860-i104220875.html
https://www.daraz.com.np/products/fantech-x15-phantom-wired-gaming-mouse-i100928540.html
https://www.daraz.com.np/products/redragon-m801-pc-gaming-mouse-led-rgb-backlit-mmo-9-programmable-buttons-mouse-with-macro-recording-side-buttons-rapid-fire-button-for-windows-computer-gamer-wired-black-i104116249.html
https://www.daraz.com.np/products/fantech-x16-gaming-mouse-4200-dpi-adjustable-optical-cable-mouse-6-button-macro-for-mouse-gamer-fps-lol-ergonomic-mouse-i103259161.html
https://www.daraz.com.np/products/fantech-t532-wired-mouse-i100928918.html
https://www.daraz.com.np/products/fantech-w188-24ghz-wireless-mouse-i100934719.html
https://www.daraz.com.np/products/fantech-x5s-zeus-computer-wired-mouse-4800-dpi-usb-optical-pc-gaming-mouse-6d-for-pclaptop-i100007184.html
https://www.daraz.com.np/products/dell-optical-mouse-black-i104330431.html
https://www.daraz.com.np/products/fantech-raigor-ii-wg10-gaming-mouse-i103261633.html
https://www.daraz.com.np/products/fantech-w189-24ghz-wireless-mouse-i100924993.html
https://www.daraz.com.np/products/jedel-6d-optical-gaming-mouse-for-computerpc-laptop-with-led-infrared-micro-6d-dpi-adjustment-i103209858.html
https://www.daraz.com.np/products/jedel-usb-optical-mouse-i100366102.html
https://www.daraz.com.np/products/generic-24ghz-1200dpi-wireless-optical-mouse-usb-rolling-car-model-mouse-for-tablet-pc-i103147712.html
https://www.daraz.com.np/products/sunsonny-s-m3s-gaming-mouse-i103218451.html
https://www.daraz.com.np/products/razer-deathadder-multi-color-ergonomic-gaming-mouse-comfortable-grip-worlds-most-popular-gaming-mouse-i104160830.html
https://www.daraz.com.np/products/logitech-b170-wireless-optical-mouse-910-004659-i14400.html
https://www.daraz.com.np/products/micropack-m101-mouse-optical-i30608.html
https://www.daraz.com.np/products/dell-usb-optical-wired-mouse-m5111-i103237876.html
https://www.daraz.com.np/products/redragon-ranger-m910-wired-gaming-mouse-12400-dpi-i104256717.html
https://www.daraz.com.np/products/dell-usb-optical-wired-mouse-m360-i104838908.html
https://www.daraz.com.np/products/limeidi-x1-24ghz-2400dpi-wireless-rechargeable-gaming-mouse-backlight-i103299466.html
https://www.daraz.com.np/products/24g-best-quality-wireless-optical-mouse-assorted-color-i103331286.html
https://www.daraz.com.np/products/redragon-m705-high-performance-wired-gaming-mouse-i104278047.html
https://www.daraz.com.np/products/fantech-wgc1-wireless-mouse-charging-design-rgb-and-2400dpi-adjustable-gaming-mouse-pixart-3212-game-chips-for-mouse-gamer-i103255259.html
https://www.daraz.com.np/products/jeqang-wired-usb-gaming-mouse-i101114219.html
https://www.daraz.com.np/products/dell-24g-best-quality-wireless-optical-mouse-i104032175.html
https://www.daraz.com.np/products/gloross-g501-gaming-mouse-with-mouse-pad-i101672317.html
https://www.daraz.com.np/products/fantech-hive-ux2-gaming-mouse-i104210128.html
https://www.daraz.com.np/products/r8-a6-wireless-bluetooth-charging-mouse-with-rgb-i104816388.html
https://www.daraz.com.np/products/dell-usb-optical-wired-mouse-m5111-i104862052.html
https://www.daraz.com.np/products/lenovo-mini-optical-mouse-i100824202.html
https://www.daraz.com.np/products/jedel-gaming-mouse-gm740-original-i104798245.html
https://www.daraz.com.np/products/jedel-w450-wireless-optical-mouse-1000-dpi-i104776750.html
https://www.daraz.com.np/products/r8-1611-led-accurate-gaming-mouse-i404316.html
https://www.daraz.com.np/products/jedel-mst-1080g-2-usb-optical-mouse-black-i176104.html
https://www.daraz.com.np/products/dell-usb-optical-wired-mouse-m5111-i104546789.html
https://www.daraz.com.np/products/prolink-wireless-optical-mouse-pmw6005-i100838336.html
https://www.daraz.com.np/products/24-ghz-wireless-mouse-with-usb-20-reciever-i100680189.html
https://www.daraz.com.np/products/wiwu-wm101-bluetooth-wireless-rechargeable-mouse-i104868803.html
https://www.daraz.com.np/products/black-wireless-mouse-i112671.html
I have to modify this code so that the scraper keeps only the links that contain a specific keyword. In my case, I'm scraping a newspaper page to find news related to the term 'Brexit'.
I've tried modifying the parse_links method so it only keeps the links (or 'a' tags) that contain 'Brexit' in them, but it doesn't seem to work.
Where should I place the condition?
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

class MultiThreadScraper:

    def __init__(self, base_url):
        self.base_url = base_url
        self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=20)
        self.scraped_pages = set([])
        self.to_crawl = Queue(10)
        self.to_crawl.put(self.base_url)

    def parse_links(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)

    def scrape_info(self, html):
        return

    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        try:
            res = requests.get(url, timeout=(3, 30))
            return res
        except requests.RequestException:
            return

    def run_scraper(self):
        while True:
            try:
                target_url = self.to_crawl.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue

if __name__ == '__main__':
    s = MultiThreadScraper("https://elpais.com/")
    s.run_scraper()
You need to import the re module to match on the specific text value. Try the code below.
import re
links = soup.find_all('a', text=re.compile("Brexit"))
This should return only the links whose text contains "Brexit".
You can get the text of the element using the getText() method and check whether the string actually contains "Brexit":
if "Brexit" in link.getText().split():
    url = link["href"]
I added a check in this function. See if that does the trick for you:
def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', href=True)
    for link in links:
        if 'BREXIT' in link.text.upper():  # <------ new if statement
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)