Extract URLs from a website including archived links - Python

I'm crawling a news website to extract all links, including the archived ones, which is typical of a news site. The site has a View More Stories button that loads more articles. The code below
import requests
from bs4 import BeautifulSoup

def find_urls():
    start_url = "e.vnexpress.net/news/business"
    r = requests.get("http://" + start_url)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    links = soup.findAll('a')
    url_list = []
    for url in links:
        all_link = url.get('href')
        if all_link and all_link.startswith('http://e.vnexpress.net/news/business'):
            url_list.append(all_link)
    return set(url_list)
successfully loads quite a few URLs, but how do I load more? Here is a snippet of the button:
<a href="javascript:void(0)" id="vnexpress_folder_load_more" data-page="2"
data-cate="1003895">
View more stories
</a>
Can someone help me out? Thanks.

You can use a browser automation tool like Selenium to click the button until it disappears or is disabled. Then you can scrape the entire page with BeautifulSoup in one go.
from selenium import webdriver

# initialize browser
driver = webdriver.Firefox()
driver.set_window_size(1120, 550)
driver.get("http://e.vnexpress.net/news/news")

# run this till the button is present
elem = driver.find_element_by_id('vnexpress_folder_load_more')
elem.click()
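A minimal sketch of the full loop, assuming the button keeps the id vnexpress_folder_load_more and disappears once no more stories are available:

import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Firefox()
driver.get("http://e.vnexpress.net/news/business")

# Click "View more stories" until the button can no longer be found
while True:
    try:
        driver.find_element_by_id('vnexpress_folder_load_more').click()
        time.sleep(1)  # crude wait for the new articles to load
    except NoSuchElementException:
        break

# Parse the fully expanded page in one go
soup = BeautifulSoup(driver.page_source, "html.parser")
url_list = {a['href'] for a in soup.find_all('a', href=True)
            if a['href'].startswith('http://e.vnexpress.net/news/business')}
driver.quit()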

Related

How to scrape hyperlinks from multiple pages on webpage with unchanging URL, using Python/Beautiful Soup?

There is a paginated list of hyperlinks on this webpage: https://www.farmersforum.ie/mart-reports/county-Tipperary-mart/.
The code I have written so far scrapes the relevant links from the first page, but I cannot figure out how to extract links from the subsequent pages (8 links per page, about 25 pages).
There does not seem to be a way to navigate the pages using the URL.
from bs4 import BeautifulSoup
import urllib.request

# Scrape webpage
parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed
resp = urllib.request.urlopen("https://www.farmersforum.ie/mart-reports/county-Tipperary-mart/")
soup = BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))

# Extract links
links = []
for link in soup.find_all('a', href=True):
    links.append(link['href'])

# Select relevant links, reformat, and drop duplicates
links = list(dict.fromkeys(["https://www.farmersforum.ie" + link for link in links if "/reports/Thurles" in link]))
Please advise on how I can do this using Python.
I've solved this with Selenium. Thank you.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time

# Launch Chrome driver
driver = webdriver.Chrome(ChromeDriverManager().install())

# Open webpage
driver.get("https://www.farmersforum.ie/mart-reports/county-Tipperary-mart/")

# Loop through pages
allLnks = []
iStop = False

# Continue until we fail to find the button
while not iStop:
    for ii in range(2, 12):
        try:
            # Click page
            driver.find_element_by_xpath('//*[@id="mainContent"]/div/div[1]/div[2]/ul/li[' + str(ii) + ']/a').click()
        except Exception:
            iStop = True
            break
        # Wait to load
        time.sleep(0.1)
        # Identify elements with tagname <a>
        lnks = driver.find_elements_by_tag_name("a")
        # Traverse list of links
        iiLnks = []
        for lnk in lnks:
            # Use get_attribute() to get each href and add the links to the list
            iiLnks.append(lnk.get_attribute("href"))
        # Select relevant links, reformat, and drop duplicates (hrefs can be None)
        iiLnks = list(dict.fromkeys([iiLnk for iiLnk in iiLnks if iiLnk and "/reports/Thurles" in iiLnk]))
        allLnks = allLnks + iiLnks
    # Advance to the next block of pages unless the button lookup failed
    if not iStop:
        driver.find_element_by_xpath('//*[@id="mainContent"]/div/div[1]/div[2]/ul/li[12]/a').click()
driver.quit()
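As an aside, the fixed time.sleep(0.1) can be flaky on slow connections; an explicit wait is more robust. A minimal sketch, assuming the report links keep the /reports/Thurles href pattern:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Wait up to 10 seconds for at least one report link to appear after a page click
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[href*='/reports/Thurles']")))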

BeautifulSoup cannot find 'class' href

This is the page I'm trying to scrape:
https://etherscan.io/address/0xCcE984c41630878b91E20c416dA3F308855E87E2
I want to scrape the listbox href next to the Token label.
I need to scrape the href from
class="link-hover d-flex justify-content-between align-items-center"
so my code is:
import requests
from bs4 import BeautifulSoup
page = requests.get('https://etherscan.io/address/0xCcE984c41630878b91E20c416dA3F308855E87E2').text
html = BeautifulSoup(page, 'html.parser')
href = html.find(class_ = 'link-hover d-flex justify-content-between align-items-center')['href']
However, the result is nothing.
Can anyone help me?
The element of interest is rendered by JavaScript. Thus, you will need some browser automation software to render the JavaScript, in order to get the full HTML necessary.
Note: You could use requests-html which supports JavaScript rendering. However, it does use a browser automation software itself, so, in my opinion, it's best to get rid of the "middle-man".
Selenium
import bs4
from selenium import webdriver

browser = webdriver.Firefox()
browser.get("https://etherscan.io/address/0xCcE984c41630878b91E20c416dA3F308855E87E2")
elem = browser.find_element_by_id("availableBalanceDropdown")
elem.click()
soup = bs4.BeautifulSoup(browser.page_source, features="html.parser")
Playwright
import bs4
from playwright.sync_api import sync_playwright

with sync_playwright() as play:
    browser = play.chromium.launch()
    page = browser.new_page()
    page.goto("https://etherscan.io/address/0xCcE984c41630878b91E20c416dA3F308855E87E2")
    page.click("#availableBalanceDropdown")
    soup = bs4.BeautifulSoup(page.content(), features="html.parser")
    browser.close()
Once you have the bs4.BeautifulSoup object, it's just a matter of scraping for the CSS selector.
import bs4
soup = bs4.BeautifulSoup(...) # From above examples
elems = soup.select(".link-hover.d-flex.justify-content-between.align-items-center")
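For example, to pull the href attributes out of the matched elements (skipping any element that lacks one):

hrefs = [elem["href"] for elem in elems if elem.has_attr("href")]
print(hrefs)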

Extract all links from drop down list combination

I have a sample website and I want to extract all the href links from it. It has two drop-downs, and once a selection is made it displays the results, with a link to download the manual.
It does not navigate to a different page; instead, it shows the results on the same page. I have extracted the combinations from the drop-down lists, but when I try to extract the manual links I am unable to find them.
The code is as follows:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
from bs4 import BeautifulSoup
import requests

url = "https://www.cars.com/"
headers = {'User-Agent': 'Mozilla/5.0'}
driver = webdriver.Chrome('C:/Users/webdrivers/chromedriver.exe')
driver.get(url)
time.sleep(4)

selectYear = Select(driver.find_element_by_id("odl-selected-year"))
data = []
for yearOption in selectYear.options:
    yearText = yearOption.text
    selectYear.select_by_visible_text(yearText)
    time.sleep(1)
    selectModel = Select(driver.find_element_by_id("odl-selected-model"))
    for modelOption in selectModel.options:
        modelText = modelOption.text
        selectModel.select_by_visible_text(modelText)
        data.append([yearText, modelText])
        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.text, 'html.parser')
        content = soup.findAll('div', attrs={"class": "odl-results-container"})
        for i in content:
            x = i.findAll(['h3', 'span'])
            for y in x:
                print(y.get_text())
The print statements do not show any data. How can I get the links for the manuals? Thanks in advance.
You need to click the button for each car model and year, and then retrieve the rendered HTML page source from your Selenium webdriver rather than fetching the page again with requests.
Add this in your inner loop:
button = driver.find_element_by_link_text("Select this vehicle")
button.click()
page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')
content = soup.findAll('a', attrs={"class": "odl-download-link"})
for i in content:
    print(i["href"])
This prints out:
http://www.fordservicecontent.com/Ford_Content/vdirsnet/OwnerManual/Home/Index?Variantid=6875&languageCode=EN&countryCode=USA&marketCode=US&bookcode=O91668&VIN=&userMarket=GBR
http://www.fordservicecontent.com/Ford_Content/vdirsnet/OwnerManual/Home/Index?Variantid=7126&languageCode=EN&countryCode=USA&marketCode=US&bookcode=O134871&VIN=&userMarket=GBR
http://www.fordservicecontent.com/Ford_Content/vdirsnet/OwnerManual/Home/Index?Variantid=7708&languageCode=EN&countryCode=USA&marketCode=US&bookcode=O177941&VIN=&userMarket=GBR
...
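Putting it together, the inner loop of the original script might look like this (a sketch: the "Select this vehicle" link text and the odl-download-link class are taken from the snippets above, and the wait time is a guess):

for modelOption in selectModel.options:
    modelText = modelOption.text
    selectModel.select_by_visible_text(modelText)
    data.append([yearText, modelText])
    # Click the button so the results (and manual links) are rendered
    driver.find_element_by_link_text("Select this vehicle").click()
    time.sleep(1)  # crude wait for the results to render
    # Parse the rendered page source instead of re-fetching with requests
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for a in soup.findAll('a', attrs={"class": "odl-download-link"}):
        print(a["href"])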

Extract urls from the footer of a web page

I am trying to extract the social media links from websites for my research. Unfortunately, I am not able to extract them because they are located in the footer of the website.
I tried the requests, urllib.request, and pattern.web APIs to download the HTML document of a webpage. All of these APIs download the same content, and all fail to download the content in the footer of the websites.
import requests
from bs4 import BeautifulSoup as soup

url = 'https://cloudsight.ai/'
headers = {'User-Agent': 'Mozilla/5.0'}
sm_sites = ['https://www.twitter.com', 'https://www.facebook.com',
            'https://www.youtube.com', 'https://www.linkedin.com',
            'https://www.linkedin.com/company', 'https://twitter.com',
            'https://facebook.com', 'https://youtube.com', 'https://linkedin.com',
            'http://www.twitter.com', 'http://www.facebook.com',
            'http://www.youtube.com', 'http://www.linkedin.com',
            'http://www.linkedin.com/company', 'http://twitter.com',
            'http://facebook.com', 'http://youtube.com', 'http://linkedin.com']
blocked = ['embed', 'search', 'sharer', 'intent', 'share', 'watch']
sm_sites_present = []

r = requests.get(url, headers=headers)
content = soup(r.content, 'html.parser')
text = r.text
links = content.find_all('a', href=True)
for link in links:
    a = link.attrs['href'].strip('/')
    try:
        if any(site in a for site in sm_sites) and not any(block in a for block in blocked):
            sm_sites_present.append(a)
    except:
        sm_sites_present.append(None)
output:
>>> sm_sites_present
>>> []
If you inspect the website, you can see that the social media information is in the footer div of the DOM.
But even text.find('footer') returns -1.
I tried for many hours to figure out how to extract this footer information, and I failed.
So, I kindly request that someone help me solve this.
Note:
I even tried regex; the problem is that when we download the page, the footer information is not downloaded with it.
As suggested by @chitown88, you can use Selenium to get the content.
from bs4 import BeautifulSoup
from selenium import webdriver

url = 'https://cloudsight.ai/'
driver = webdriver.Firefox()
driver.get(url)
html = driver.page_source
driver.quit()

soup = BeautifulSoup(html, 'html.parser')
[i.a['href'] for i in soup.footer.find_all('li', {'class': 'social-list__item'})]
output
['https://www.linkedin.com/company/cloudsight-inc',
'https://www.facebook.com/CloudSight',
'https://twitter.com/CloudSightAPI']
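To reuse the filtering logic from the question on the Selenium-rendered HTML, a minimal sketch (assuming sm_sites and blocked are defined as in the question):

sm_sites_present = []
for link in soup.find_all('a', href=True):
    a = link['href'].strip('/')
    if any(site in a for site in sm_sites) and not any(block in a for block in blocked):
        sm_sites_present.append(a)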

How to scrape text that only unlocks after clicking "more" button

I'm trying to scrape reviews from the TripAdvisor website. I succeeded in scraping the reviews, but some reviews are long and only partially shown until you click the "More" button.
This is the link of the website :
https://www.tripadvisor.ca/Hotel_Review-g190479-d3587956-Reviews-The_Thief-Oslo_Eastern_Norway.html#REVIEWS
This is how I grab the reviews from the page:
for item in soup.findAll(attrs={"class": "entry"}):
    review = item.text.replace(',', '').replace('\n', ' ').encode('utf-8').strip()
How do I manage to scrape all of the reviews after the "More" button is clicked?
Try loading the page in Selenium. This will allow you to interact with the JavaScript. I haven't tried it with BeautifulSoup, but I think it would look something like this:
from selenium import webdriver
from bs4 import BeautifulSoup

browser = webdriver.Firefox()  # or any other driver you want
browser.get('https://www.tripadvisor.ca/Hotel_Review-g190479-d3587956-Reviews-The_Thief-Oslo_Eastern_Norway.html#REVIEWS')
next_btn = browser.find_element_by_xpath('PATH_FOR_NEXT_LINK_ELEMENT')
next_btn.click()
html_source = browser.page_source
browser.quit()
soup = BeautifulSoup(html_source, 'html.parser')
review = soup("YOUR_SCRAPING_LOGIC")
When you click the "More" link, JavaScript runs in the browser to fetch data or jump to another link. requests only returns the initial HTML; it cannot execute the JavaScript.
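A sketch of that approach: click every "More" expander before grabbing the page source. The locator below is a hypothetical placeholder; inspect the page to find the actual XPath of the "More" links.

from selenium import webdriver
from bs4 import BeautifulSoup

browser = webdriver.Firefox()
browser.get('https://www.tripadvisor.ca/Hotel_Review-g190479-d3587956-Reviews-The_Thief-Oslo_Eastern_Norway.html#REVIEWS')

# Click each "More" link; 'PATH_FOR_MORE_LINK_ELEMENTS' is a hypothetical placeholder
for btn in browser.find_elements_by_xpath('PATH_FOR_MORE_LINK_ELEMENTS'):
    try:
        btn.click()
    except Exception:
        pass  # some expanders may already be open or not clickable

html_source = browser.page_source
browser.quit()

soup = BeautifulSoup(html_source, 'html.parser')
for item in soup.findAll(attrs={"class": "entry"}):
    review = item.text.replace(',', '').replace('\n', ' ').encode('utf-8').strip()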
