How can I get a URL address using BeautifulSoup - Python

I'm trying to crawl the URL from each <a href=...> tag, but on this site the href is "#none". How can I get the real URL? I've already tried a lot but couldn't find any tips. The links look like this:
<a href="#none" onclick="goDetail(519975);">
    title
</a>
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re

ssl._create_default_https_context = ssl._create_unverified_context

html = urlopen('https://www.daegu.ac.kr/article/DG159/list')
bs = BeautifulSoup(html, 'html.parser')
nameList = bs.findAll('td', {'class': 'list_left'})

for name in nameList:
    print(name.get_text())
    print(name.get_url)  # this is where I'm stuck - it doesn't print the URL
    print('\n----------------------------------------------')

You can concatenate the id from the onclick attribute onto a base URL (which is what the onclick handler does). The first three links (the ones without an onclick) use a different base.
from bs4 import BeautifulSoup as bs
import requests

base1 = 'https://www.daegu.ac.kr/article/DG159/detail/'
base2 = 'https://www.daegu.ac.kr/article/DG159'

r = requests.get('https://www.daegu.ac.kr/article/DG159/list')
soup = bs(r.content, 'lxml')

links = [base1 + a['onclick'].split('(')[1].split(')')[0] if a.has_attr('onclick')
         else base2 + a['href']
         for a in soup.select('.board_tbl_list a')]
print(links)
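If the onclick arguments ever change shape, a regular expression is a slightly more forgiving way to pull out the id. A minimal sketch of that variation, using the same selector and base URLs as the answer above and assuming every link without an onclick has a usable href:
import re
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.daegu.ac.kr/article/DG159/list')
soup = BeautifulSoup(r.content, 'html.parser')

links = []
for a in soup.select('.board_tbl_list a'):
    m = re.search(r'goDetail\((\d+)\)', a.get('onclick', ''))  # grab the numeric article id
    if m:
        links.append('https://www.daegu.ac.kr/article/DG159/detail/' + m.group(1))
    else:
        links.append('https://www.daegu.ac.kr/article/DG159' + a['href'])
print(links)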

Related

Beautiful soup returns random elements when called for a certain tag or class name

From the following site, I need to extract the current price of each listed product, which sits in a tag with a class name like "current price". I wrote the code below to get it, but what I get is the price mixed in with a bunch of other stuff. How can I filter out the price alone from the HTML?
https://www.tendercuts.in/chicken
Here is the code I used:
import requests
from bs4 import BeautifulSoup
baseurl = 'https://www.tendercuts.in/'
r = requests.get('https://www.tendercuts.in/chicken')
soup = BeautifulSoup(r.content, 'lxml')
productweight = soup.find_all('p', class_='currentprice')
print(productweight)
You need to use the correct class as follows:
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.tendercuts.in/chicken')
soup = BeautifulSoup(r.content, 'lxml')

for p in soup.find_all('p', class_='current-price'):
    print(p.text)
Here you go... there are two changes to make in your code. Check the following code:
import requests
from bs4 import BeautifulSoup
baseurl = 'https://www.tendercuts.in/'
r = requests.get('https://www.tendercuts.in/chicken')
soup = BeautifulSoup(r.content, 'html.parser')
productweight = soup.find_all('p', class_='current-price')
print(productweight)
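If the matched tags still carry extra whitespace or nested markup, get_text(strip=True) returns just the visible text of each one. A small follow-up, reusing the soup object from the code above:
prices = [p.get_text(strip=True) for p in soup.find_all('p', class_='current-price')]
print(prices)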

How do I exclude certain beautifulsoup results that I don't want?

I am having issues trying to exclude some results from my Beautiful Soup program. This is my code:
from bs4 import BeautifulSoup
import requests

URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

for link in soup.find_all('a'):
    print(link.get('href'))
I don't want the results that start with a "#", for example #cite_ref-18. I have tried filtering them with for loops, but I get this error message: KeyError: 0
You can use the str.startswith() method:
from bs4 import BeautifulSoup
import requests

URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

for tag in soup.find_all('a'):
    link = tag.get('href')
    if not str(link).startswith('#'):
        print(link)
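The str() call is only there because some <a> tags have no href at all, in which case tag.get('href') returns None. If you prefer, you can ask find_all for tags that actually carry an href; a small variation of the same idea:
for tag in soup.find_all('a', href=True):  # only anchors that have an href attribute
    if not tag['href'].startswith('#'):
        print(tag['href'])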
You can use the CSS selector a[href]:not([href^="#"]). This selects all <a> tags that have an href attribute, excluding the ones whose href starts with a # character:
import requests
from bs4 import BeautifulSoup

URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

for link in soup.select('a[href]:not([href^="#"])'):
    print(link['href'])

Scraping tracks' links from YouTube playlist with Beautiful Soup

I am trying to scrape the links for all tracks in my playlist. This is my code:
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
playlist = 'minimal_house'
url = 'https://www.youtube.com/channel/UCt2GxiTBN_RiE-cbP0cmk5Q/playlists'
html = urlopen(url)
soup = BeautifulSoup(html , 'html.parser')
tracks = soup.find(title = playlist).get('href')
print(tracks)
url = url + tracks
print(url)
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a',attrs={'class':'yt-simple-endpoint style-scope ytd-playlist-panel-video-renderer'})
print(links)
I couldn't scrape the <a> tags, neither by id nor by class name.
Here is my messy code that works for me:
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re

playlist = 'minimal_house'
url = 'https://www.youtube.com/channel/UCt2GxiTBN_RiE-cbP0cmk5Q/playlists'

html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
tracks = soup.find('a', attrs={'title': playlist}).get('href')
print(tracks)

url = 'https://www.youtube.com' + str(tracks)
print(url)

html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a')
# keep only the hrefs that point to a watch page
links = set([link.get('href') for link in links
             if link.get('href') and 'watch' in link.get('href')])
print(links)
Since the class names change based on the device making the request, it's better to collect all the links in this case. You also need Selenium to scroll down and fetch the whole list, as sketched below.
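A minimal sketch of that Selenium step, assuming chromedriver is on your PATH and reusing the url variable (base plus playlist path) built in the answer above; the scroll count and delay are arbitrary:
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get(url)  # the playlist URL built above

# scroll a few times so YouTube lazy-loads the rest of the playlist
for _ in range(10):
    driver.execute_script('window.scrollTo(0, document.documentElement.scrollHeight);')
    sleep(2)

soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

links = set(a.get('href') for a in soup.find_all('a')
            if a.get('href') and 'watch' in a.get('href'))
print(links)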

locating child element by BeautifulSoup

I am new to BeautifulSoup and I am practicing with little tasks. Here I am trying to get the "previous" link on this site. The HTML is here. My code is:
import requests, bs4
from bs4 import BeautifulSoup
url = 'https://www.xkcd.com/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
result = soup.find('div', id="comic")
url2 = result.find('ul', class_='comicNav').find('a', rel='prev').find('href')
But it returns a NoneType error. I have read some posts about child elements in HTML and tried some different things, but it still does not work. Thank you for your help in advance.
You could use a CSS selector instead.
import requests, bs4
from bs4 import BeautifulSoup
url = 'https://www.xkcd.com/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
result = soup.select('.comicNav a[rel~="prev"]')[0]
print(result)
If you want just the href, change it to:
result = soup.select('.comicNav a[rel~="prev"]')[0]["href"]
To get the prev link, find the ul tag and then find the a tag within it. Try the code below.
import requests, bs4
from bs4 import BeautifulSoup
url = 'https://www.xkcd.com/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
url2 = soup.find('ul', class_='comicNav').find('a',rel='prev')['href']
print(url2)
Output:
/2254/
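Since the href is relative, you may want to join it back onto the site root. A small sketch using urllib.parse.urljoin, reusing url2 from the answer above:
from urllib.parse import urljoin

prev_url = urljoin('https://www.xkcd.com/', url2)  # e.g. https://www.xkcd.com/2254/
print(prev_url)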

Extracting links from html from the link of the following website

I want to extract the link
/stocks/company_info/stock_news.php?sc_id=CHC&scat=&pageno=2&next=0&durationType=Y&Year=2018&duration=1&news_type=
from the HTML of the page
http://www.moneycontrol.com/company-article/piramalenterprises/news/PH05#PH05
The following is the code that I used with Beautiful Soup:
import requests
from bs4 import BeautifulSoup

url_list = "http://www.moneycontrol.com/company-article/piramalenterprises/news/PH05#PH05"
html = requests.get(url_list)
soup = BeautifulSoup(html.text, 'html.parser')
link = soup.find_all('a')
print(link)
How would I go about it? Using find_all('a') doesn't return the required link in the returned HTML.
Please try this to get the exact URL you want.
import bs4 as bs
import requests
import re

sauce = requests.get('https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id=CHC&durationType=Y&Year=2018')
soup = bs.BeautifulSoup(sauce.text, 'html.parser')

for a in soup.find_all('a', href=re.compile("company_info")):
    # print(a['href'])
    if 'pageno' in a['href']:
        print(a['href'])
Output:
/stocks/company_info/stock_news.php?sc_id=CHC&scat=&pageno=2&next=0&durationType=Y&Year=2018&duration=1&news_type=
/stocks/company_info/stock_news.php?sc_id=CHC&scat=&pageno=3&next=0&durationType=Y&Year=2018&duration=1&news_type=
You just have to use the get() method to read the href attribute:
from bs4 import BeautifulSoup as soup
import requests

url_list = "http://www.moneycontrol.com/company-article/piramalenterprises/news/PH05#PH05"
html = requests.get(url_list)
page = soup(html.text, 'html.parser')

link = page.find_all('a')
for l in link:
    print(l.get('href'))
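If you specifically want the pagination links from the question, you can filter while printing. A small sketch reusing the page object above; note that, as the first answer shows, these links appear on the stock_news.php listing page, so you may need to fetch that URL instead:
for l in page.find_all('a', href=True):  # only anchors that carry an href
    if 'pageno' in l['href']:
        print(l['href'])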
