How to scrape plaintext from multiple links off of one website? - python

from bs4 import BeautifulSoup
import bs4 as bs
import pandas as pd
import numpy as py
import json
import csv
import re
import urllib.request
sauce =
urllib.request.urlopen("https://www.imdb.com/list/ls003073623/").read()
soup = bs.BeautifulSoup(sauce, 'html.parser')
soup.findAll('a', href=re.compile('^/title/'))
I am trying to scrape multiple links off of a website (about 500) and I don't want to manually input each and every URL, how do I go about scraping this?

With BeautifulSoup
If I understand it right, you are trying to obtain a list containing a part of all the links on a given website. There is an example on BeautifulSoup's documentation that shows exactly how to do that:
from BeautifulSoup import BeautifulSoup
import urllib2
import re
html_page = urllib2.urlopen("https://www.imdb.com/list/ls003073623/")
soup = BeautifulSoup(html_page)
ids = []
for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
ids.append(link.get('href').split("/")[4])
print(ids)
With Selenium
For reference, and since it doesn't seem like the question is limited to only BeautifulSoup, here's how we would do the same using Selenium, a very popular alternative.
from selenium import webdriver
driver = webdriver.Chrome()
driver.get("https://www.imdb.com/list/ls003073623/")
ids = []
elems = driver.find_elements_by_xpath("//a[#href]")
for elem in elems:
ids.append(elem.get_attribute("href").split("/")[4])
print(ids)

Related

Web Scraping with Beautiful Soup Python - Seesaw - The output has no length error

I am trying to get the comments from a website called Seesaw but the output has no length. What am I doing wrong?
import requests
import requests
import base64
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as req
from requests import get
html_text = requests.get("https://app.seesaw.me/#/activities/class/class.93a29acf-0eef-4d4e-9d56-9648d2623171").text
soup = BeautifulSoup(html_text, "lxml")
comments = soup.find_all("span", class_ = "ng-binding")
print(comments)
Because there is no span element with class ng-binding on the page (these elements added later via JavaScript)
import requests
html_text = requests.get("https://app.seesaw.me/#/activities/class/class.93a29acf-0eef-4d4e-9d56-9648d2623171").text
print(f'{"ng-binding" in html_text=}')
So output is:
"ng-binding" in html_text=False
Also you can check it using "View Page Source" function in your browser. You can try to use Selenium for automate interaction with the site.

Extract URL in Sitemap With Python

I need the extract links in sitemap
https://wunder.com.tr/sitemap.xml
I wrote some code
import requests
from bs4 import BeautifulSoup
wunder = requests.get("https://wunder.com.tr/sitemap.xml")
parcala = BeautifulSoup(wunder.content,"lxml")
links = parcala.find_all("html-tag")
print(links)
But unable to extract.
import requests
from bs4 import BeautifulSoup
wunder = requests.get("https://wunder.com.tr/sitemap.xml")
parcala = BeautifulSoup(wunder.content, "xml")
urls_from_xml = []
loc_tags = parcala.find_all('loc')
for loc in loc_tags:
urls_from_xml.append(loc.get_text())
print(urls_from_xml)

Neither Selenium or Beautiful soup showing full html source?

I tried using beautiful soup to parse a website, however when I printed "page_soup" I would only get a portion of the HTML, the beginning portion of the code, which has the info I need, was omitted. No one answered my question. After doing some research I tried using Selenium to access the full HTML, however I got the same results. Below are both of my attempts with selenium and beautiful soup. When I try and print the html it starts off in the middle of the source code, skipping the doctype, lang etc initial statements.
from selenium import webdriver
from bs4 import BeautifulSoup
browser = webdriver.Chrome( executable_path= "/usr/local/bin/chromedriver")
browser.get('https://coronavirusbellcurve.com/')
html = browser.page_source
soup = BeautifulSoup(html)
print(soup)
import bs4
import urllib
from urllib.request import urlopen as uReq
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
htmlPage = urlopen(pageRequest).read()
page_soup = soup(htmlPage, 'html.parser')
print(page_soup)
The requests module seems to be returning the numbers in the first table on the page assuming you are referring to US Totals.
import requests
r = requests.get('https://coronavirusbellcurve.com/').content
print(r)

Not able to find a link in a product page

I am trying to make a list of the links that are inside a product page.
I have multiple links through which I want to get the links of the product page.
I am just posting the code for a single link.
r = requests.get("https://funskoolindia.com/products.php?search=9723100")
soup = BeautifulSoup(r.content)
for a_tag in soup.find_all('a', class_='product-bg-panel', href=True):
print('href: ', a_tag['href'])
This is what it should print: https://funskoolindia.com/product_inner_page.php?product_id=1113
The site is dynamic, thus, you can use selenium
from bs4 import BeautifulSoup as soup
from selenium import webdriver
d = webdriver.Chrome('/path/to/chromedriver')
d.get('https://funskoolindia.com/products.php?search=9723100')
results = [*{i.a['href'] for i in soup(d.page_source, 'html.parser').find_all('div', {'class':'product-media light-bg'})}]
Output:
['product_inner_page.php?product_id=1113']
The data are loaded dynamically through Javascript from different URL. One solution is using selenium - that executes Javascript and load links that way.
Other solution is using re module and parse the data url manually:
import re
import requests
from bs4 import BeautifulSoup
url = 'https://funskoolindia.com/products.php?search=9723100'
data_url = 'https://funskoolindia.com/admin/load_data.php'
data = {'page':'1',
'sort_val':'new',
'product_view_val':'grid',
'show_list':'12',
'brand_id':'',
'checkboxKey': re.findall(r'var checkboxKey = "(.*?)";', requests.get(url).text)[0]}
soup = BeautifulSoup(requests.post(data_url, data=data).text, 'lxml')
for a in soup.select('#list-view .product-bg-panel > a[href]'):
print('https://funskoolindia.com/' + a['href'])
Prints:
https://funskoolindia.com/product_inner_page.php?product_id=1113
try this : print('href: ', a_tag.get("href"))
and add features="lxml" to the BeautifulSoup constructor

Using Requests and BeautifulSoup - Python returns tag with no text

I'm trying to capture the number of visits on this page, but python returns the tag with no text.
This is what I've done.
import requests
from bs4 import BeautifulSoup
r = requests.get("http://www.kijiji.ca/v-2-bedroom-apartments-condos/city-of-halifax/clayton-park-west-condo-style-luxury-2-bed-den/1016364514")
soup = BeautifulSoup(r.content)
print soup.find_all("span",{"class":"ad-visits"})
The values you are trying to scrape are populated by javascript so beautfulsoup or requests aren't going to work in this case.
You'll need to use something like selenium to get the output.
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Firefox()
driver.get("http://www.kijiji.ca/v-2-bedroom-apartments-condos/city-of-halifax/clayton-park-west-condo-style-luxury-2-bed-den/1016364514")
soup = BeautifulSoup(driver.page_source , 'html.parser')
print soup.find_all("span",{"class":"ad-visits"})
Selenium will return the page source as rendered and you can then use beautifulsoup to get the value
[<span class="ad-visits">385</span>]

Categories