How do I exclude certain beautifulsoup results that I don't want?

How do I exclude certain beautifulsoup results that I don't want? - python

I am having trouble excluding some of the results returned by my Beautiful Soup program. This is my code:
from bs4 import BeautifulSoup
import requests
# Download the article and parse it with Python's built-in HTML parser.
URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

# Print the href of every <a> element, including in-page "#..." anchors.
for anchor in soup.find_all('a'):
    href = anchor.get('href')
    print(href)
I don't want to get the results that start with a "#" for example: #cite_ref-18
I have tried using for loops but I get this error message: KeyError: 0

You can use the str.startswith() method:
from bs4 import BeautifulSoup
import requests
# Download the article and parse it with Python's built-in HTML parser.
URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

# href=True restricts the search to anchors that actually have an href, so
# there is no None value to stringify and no stray "None" in the output.
for tag in soup.find_all('a', href=True):
    link = tag['href']
    # Skip in-page fragment links such as "#cite_ref-18".
    if not link.startswith('#'):
        print(link)

You can use CSS selector a[href]:not([href^="#"]). This will select all <a> tags with href= attribute but not the ones starting with # character:
import requests
from bs4 import BeautifulSoup
# Download the article and parse it with Python's built-in HTML parser.
URL = 'https://en.wikipedia.org/wiki/List_of_Wikipedia_mobile_applications'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

# Anchors that have an href attribute whose value does not start with "#".
non_fragment_anchors = soup.select('a[href]:not([href^="#"])')
for anchor in non_fragment_anchors:
    print(anchor['href'])

Related

How do I get certain links from a website, but not all of them?

Here's what I have so far:
import requests
from bs4 import BeautifulSoup
def linkScraper():
    """Print the href of every <a> element found on the BBC homepage."""
    response = requests.get("https://www.bbc.com/")
    soup = BeautifulSoup(response.text, 'html.parser')
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        print(href)
But this prints every single link on the website. How can I configure this to give me the links to the articles that appear on the BBC's homepage?

You can filter it with list comprehension:
import requests
from bs4 import BeautifulSoup
def linkScraper():
    """Print every link on the BBC homepage that points at www.bbc.com.

    Only anchors whose href starts with 'https://www.bbc.com/' are printed,
    one per line.
    """
    html = requests.get("https://www.bbc.com/").text
    soup = BeautifulSoup(html, 'html.parser')
    # href=True skips anchors without an href attribute; indexing such a tag
    # with ['href'] would raise KeyError.
    links = [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith('https://www.bbc.com/')
    ]
    for link in links:
        print(link)

Not Getting Expected find results with BeautifulSoup

Hi I am trying to get all questions on stackoverflow page
For this I am trying the code below, but nothing is printed. Is there anything I can fix here?
from bs4 import BeautifulSoup
import requests
# Download the Stack Overflow front page as raw bytes and parse it.
content = requests.get("https://stackoverflow.com/").content
soup = BeautifulSoup(content, features="html.parser")

# Print each anchor that has the question-hyperlink class and an href.
question_anchors = soup.find_all("a", class_="question-hyperlink", href=True)
for anchor in question_anchors:
    print(anchor)

You should consider using this to be able to get all links:
from bs4 import BeautifulSoup
import requests
# Download the Stack Overflow front page as raw bytes and parse it.
content = requests.get("https://stackoverflow.com/").content
soup = BeautifulSoup(content, features="html.parser")

# Gather the href of every anchor that has one, then print the whole list.
links = []
for anchor in soup.find_all('a', href=True):
    links.append(anchor['href'])
print(links)
To be able to search by a class you should use this:
from bs4 import BeautifulSoup
import requests
# Download the Stack Overflow front page as raw bytes and parse it.
content = requests.get("https://stackoverflow.com/").content
soup = BeautifulSoup(content, features="html.parser")

# Gather hrefs only from anchors with the question-hyperlink class.
links = []
for anchor in soup.find_all('a', class_="question-hyperlink", href=True):
    links.append(anchor['href'])
print(links)
However, after running the script I got an empty list, which means no link with the given class was found.

remove html tags from string using bs4

I'm trying to make a program to read the price of bitcoin from a website. I used bs4 and was able to get the section I was looking for, but it's surrounded by HTML tags.
output: <div class="priceValue___11gHJ">$52,693.18</div>
I just want the price. I have tried the regex and lxml methods, but I keep getting errors.
import requests
from bs4 import BeautifulSoup
# Fetch the bitcoin page.
url = "https://coinmarketcap.com/currencies/bitcoin/"
r = requests.get(url)

# Parse the response body with the html5lib parser.
soup = BeautifulSoup(r.content, 'html5lib')

# Look up the price <div> by its class; printing the Tag shows its markup too.
find_div = soup.find('div', attrs={"class": "priceValue___11gHJ"})
print(find_div)

You need to do .text:
import requests
from bs4 import BeautifulSoup
# Fetch the bitcoin page.
url = "https://coinmarketcap.com/currencies/bitcoin/"
r = requests.get(url)

# Parse the response body with the html5lib parser.
soup = BeautifulSoup(r.content, 'html5lib')

# Look up the price <div> by its class.
find_div = soup.find('div', {"class": "priceValue___11gHJ"})

# soup.find returns None when nothing matches; fail with a clear message
# instead of an AttributeError on .text.
if find_div is None:
    raise SystemExit("price element not found: no <div> with class 'priceValue___11gHJ'")

# .text yields only the text inside the tag, without the markup.
print(find_div.text) # $52,693.18

locating child element by BeautifulSoup

I am new to BeautifulSoup and I am practicing with small tasks. Here I am trying to get the "previous" link on this site. The HTML is
here
My code is
import requests, bs4
from bs4 import BeautifulSoup
# Download the xkcd front page and parse it.
url = 'https://www.xkcd.com/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# Start from the comic container, then walk down one lookup at a time.
result = soup.find('div', id="comic")
nav_list = result.find('ul', class_='comicNav')
prev_anchor = nav_list.find('a', rel='prev')
# NOTE: find('href') searches for a tag named "href"; it does not read the attribute.
url2 = prev_anchor.find('href')
But it shows NoneType.. I have read some posts about the child elements in html, and I tried some different things. But it still does not work.. Thank you for your help in advance.

You could use a CSS selector instead.
import requests, bs4
from bs4 import BeautifulSoup
# Download the xkcd front page and parse it.
url = 'https://www.xkcd.com/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# Anchors inside .comicNav whose rel attribute contains the word "prev".
prev_matches = soup.select('.comicNav a[rel~="prev"]')
result = prev_matches[0]
print(result)
If you want just the href, change the line to:
# Index the first matched tag with ["href"] to get the link target rather than the whole tag.
result = soup.select('.comicNav a[rel~="prev"]')[0]["href"]

To get the prev link, find the ul tag and then find the a tag inside it. Try the code below.
import requests, bs4
from bs4 import BeautifulSoup
# Download the xkcd front page and parse it.
url = 'https://www.xkcd.com/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# Find the navigation list, then the rel="prev" anchor in it, then read its href.
nav_list = soup.find('ul', class_='comicNav')
prev_anchor = nav_list.find('a', rel='prev')
url2 = prev_anchor['href']
print(url2)
Output:
/2254/

Extracting links from html from the link of the following website

I want to extract the link
/stocks/company_info/stock_news.php?sc_id=CHC&scat=&pageno=2&next=0&durationType=Y&Year=2018&duration=1&news_type=
from the html of the page
http://www.moneycontrol.com/company-article/piramalenterprises/news/PH05#PH05
The following is the code that is used
# Page whose links are being inspected.
url_list = "http://www.moneycontrol.com/company-article/piramalenterprises/news/PH05#PH05"
html = requests.get(url_list)

# Parse the decoded response text.
markup = html.text
soup = BeautifulSoup(markup, 'html.parser')

# Print the full list of <a> tags found in the page.
link = soup.find_all('a')
print(link)
I am using Beautiful Soup. How would I go about it? Using find_all('a') doesn't return the required link in the returned HTML.

Please try this to get Exact Url you want.
import bs4 as bs
import requests
import re
# Fetch the stock-news listing and parse it.
sauce = requests.get('https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id=CHC&durationType=Y&Year=2018')
soup = bs.BeautifulSoup(sauce.text, 'html.parser')

# Consider only anchors whose href matches "company_info".
company_info = re.compile("company_info")
for anchor in soup.find_all('a', href=company_info):
    href = anchor['href']
    # Print only the hrefs that contain "pageno".
    if 'pageno' not in href:
        continue
    print(href)
output:
/stocks/company_info/stock_news.php?sc_id=CHC&scat=&pageno=2&next=0&durationType=Y&Year=2018&duration=1&news_type=
/stocks/company_info/stock_news.php?sc_id=CHC&scat=&pageno=3&next=0&durationType=Y&Year=2018&duration=1&news_type=

You just have to use the get method to find the href attribute:
from bs4 import BeautifulSoup as soup
import requests
# Page whose links are being listed.
url_list = "http://www.moneycontrol.com/company-article/piramalenterprises/news/PH05#PH05"
html = requests.get(url_list)

# In this snippet `soup` is the imported alias for BeautifulSoup.
page = soup(html.text, 'html.parser')

# Print the href of each anchor; .get returns None when the attribute is absent.
link = page.find_all('a')
for anchor in link:
    href = anchor.get('href')
    print(href)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How do I exclude certain beautifulsoup results that I don't want? - python

Related

How do I get certain links from a website, but not all of them?

Not Getting Expected find results with BeautifulSoup

remove html tags from string using bs4

locating child element by BeautifulSoup

Extracting links from html from the link of the following website

Categories

Resources