I am trying to get the text value inside a span tag having an id attribute using BeautifulSoup, but it returns no text — only a '-'.
I have tried scraping using the div tag with the class attribute and then navigating to the span tag using findChildren() function too, but it still returns a "-". Here is the html that I am trying to scrape from the website https://etherscan.io/tokens-nft.
<div class="row align-items-center">
<div class="col-md-4 mb-1 mb-md-0">Transfers:</div>
<div class="col-md-8"></div>
<span id="totaltxns">266,765</span><hr class="hr-space">
</div>
And here is my python code:
# Scrape the NFT token list from etherscan and, for each token's page,
# print its tags, total supply, holder count and transfer count.
# NOTE: urllib2 is Python 2 only; on Python 3 use urllib.request instead.
# The indentation below restores the loop structure that the original
# paste had lost.
from urllib2 import Request, urlopen
from bs4 import BeautifulSoup as soup
import array  # NOTE(review): unused in this script

url = 'https://etherscan.io/tokens-nft'
response = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
page_html = urlopen(response).read()
page_soup = soup(page_html, 'html.parser')

count = 0                # number of NFT pages visited
total_nfts = 2691        # Hard-coded value
supply = []
totalAddr = []
transCount = []
row = []

print('All non-fungible tokens in order of Transfers')
for nfts in page_soup.find_all("a", class_='text-primary'):
    link = nfts.get('href')
    new_url = "https://etherscan.io/" + link
    name = nfts.text
    print('NFT ' + name)
    response2 = Request(new_url, headers={'User-Agent': 'Mozilla/5.0'})
    phtml = urlopen(response2).read()
    psoup = soup(phtml, 'html.parser')

    # Get tags
    tags = []
    for allTags in psoup.find_all("a", class_='u-label u-label--xs u-label--secondary'):
        tags.append(allTags.text.encode("ascii"))
    count += 1
    if len(tags) != 0:
        print(tags)

    # Get total supply
    ts = psoup.find("span", class_="hash-tag text-truncate")
    ts = ts.text
    # print(ts)

    # Get holders
    holders = psoup.find("div", {"id": "ContentPlaceHolder1_tr_tokenHolders"})
    holders = holders.findChildren()[1].findChildren()[1].text
    # print(holders)

    # Get transfers/transactions.
    # NOTE: on the live site this span is populated by JavaScript, so the
    # static HTML returned by urlopen() only ever contains '-'.
    print(psoup.find("span", attrs={"id": "totaltxns"}).text)

print('Total number of NFTS ' + str(count))
I have also tried:
transfers = psoup.find("span", attrs={"id":"totaltxns"})
but that doesn't work either.
The correct parsing should return 266,765.
To find the element by id you can use soup.find(id='your_id').
Try this:
from bs4 import BeautifulSoup as bs

# Minimal reproduction: parse a static HTML fragment and pull out the
# span's text by its id attribute via find(id=...).
markup = '''
<div class="row align-items-center">
<div class="col-md-4 mb-1 mb-md-0">Transfers:</div>
<div class="col-md-8"></div>
<span id="totaltxns">266,765</span><hr class="hr-space">
</div>
'''
parsed = bs(markup, 'html.parser')
print(parsed.find(id='totaltxns').text)
Outputs:
266,765
If you look at the page source for the link you've mentioned, the value in totaltxns is -. That's why it's returning -.
The value might just be populated with some javascript code on the page.
UPDATE
urlopen().read() simply returns the initial page source received from the server without any further client-side changes.
You can achieve your desired output using Selenium + Chrome WebDriver. The idea is we let the javascript in page run and parse the final page source.
Try this:
from bs4 import BeautifulSoup as bs
from selenium.webdriver import Chrome  # pip install selenium
from selenium.webdriver.chrome.options import Options

url = 'https://etherscan.io/token/0x629cdec6acc980ebeebea9e5003bcd44db9fc5ce'

# Run Chrome headless, i.e. in the background without opening a window.
opts = Options()
opts.add_argument("--headless")

# Let Chrome execute the page's JavaScript, then keep the final markup.
with Chrome(executable_path="./chromedriver", options=opts) as browser:
    browser.get(url)
    rendered_html = browser.page_source

# Parse the fully rendered page source.
final_soup = bs(rendered_html, 'html.parser')
print(final_soup.find(id='totaltxns').text)
Outputs:
995,632
More info on setting up webdriver + example is in another StackOverflow question here.
Related
I am trying to scrape some information from a webpage using Selenium. In <span id='text'>, I want to extract the id value (text) and in the same div I want to extract <p> element.
here is what I have tried:
import requests
from bs4 import BeautifulSoup

# Fetch the OSHA standards page and save each paragraph number together
# with its paragraph text to a plain-text file.
response = requests.get('https://www.osha.gov/laws-regs/regulations/standardnumber/1926/1926.451#1926.451(a)(6)')
html = response.text

# Parse the HTML code using Beautiful Soup to extract the desired information
soup = BeautifulSoup(html, 'html.parser')

# find all <a> elements on the page that carry a name attribute,
# e.g. name="1926.451(a)"
links = soup.find_all('a', attrs={'name': True})
print(links)

linq = [link['name'] for link in links]

information = soup.find_all('p')  # find all <p> elements on the page

# BUG FIX: the original wrote `infoo[i]` (an undefined name) and would
# also have failed because f.write() needs a string, not a Tag object.
# zip() pairs each name with its paragraph and stops at the shorter list,
# protecting against a length mismatch between the two result sets.
with open('osha.txt', 'w') as f:
    for name, para in zip(linq, information):
        f.write(name)
        f.write('\n')
        f.write(para.get_text(strip=True))
        f.write('\n')
        f.write('-' * 50)
        f.write('\n')
Below is the HTML code.
What I want is to save this in a separate text file is this information:
1926.451(a)
Capacity
<div class="field--item">
<div class="paragraph paragraph--type--regulations-standard-number paragraph--view-mode--token">
<span id="1926.451(a)">
<a href="/laws-regs/interlinking/standards/1926.451(a)" name="1926.451(a)">
1926.451(a)
</a>
</span>
<div class="field field--name-field-standard-paragraph-body-p">
<p>"Capacity"</p>
</div>
</div>
</div>
Some of the a tags and paragraphs might be missing on the page.
Use try except block to handle that.
Use css selector to get the parent node and then get respective child nodes.
Use a dataframe to store the values and export it to a CSV file.
import pandas as pd
import csv
import requests
from bs4 import BeautifulSoup

# Send an HTTP request to the website and retrieve the HTML code of the webpage
response = requests.get('https://www.osha.gov/laws-regs/regulations/standardnumber/1926/1926.451#1926.451(a)(6)')
html = response.text

code = []
para = []

# Parse the HTML code using Beautiful Soup to extract the desired information
soup = BeautifulSoup(html, 'html.parser')

# Each item holds one standard number plus (sometimes) its paragraph body.
for item in soup.select(".field.field--name-field-reg-standard-number .field--item"):
    # The anchor or the paragraph may be absent on some rows; fall back
    # gracefully so one missing node doesn't abort the whole run.
    # BUG FIX: the bare `except:` clauses are narrowed to AttributeError
    # (raised when find() returns None), so real errors still surface.
    try:
        code.append(item.find("a").text.strip())
    except AttributeError:  # no <a> in this item — use the <span> instead
        code.append(item.find("span").text.strip())
    try:
        para.append(item.find("p").text.strip())
    except AttributeError:  # no <p> in this item
        para.append("Nan")

df = pd.DataFrame({"code": code, "paragraph": para})
print(df)
df.to_csv("path/to/filename")  # fixed 'filenme' typo in the placeholder path
Output:
I am entirely new to webpage scraping and have been looking at a few YouTube videos and online to get me started.
So far, I have been trying to get all the webpage elements from the following website: https://www.letsride.co.uk/routes/search?sort_by=rating
Here is what I have so far:
from requests_html import HTMLSession
from bs4 import BeautifulSoup

s = HTMLSession()
url = 'https://www.letsride.co.uk/routes/search?sort_by=rating'


# NOTE: the function bodies below restore the indentation the original
# paste had lost.
def getdata(url):
    """Download *url* and return it parsed as a BeautifulSoup tree."""
    r = s.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup


# for i in range(1, 103):
def getnextpage(soup):
    """Return the pagination <ul> element of a results page (or None)."""
    page = soup.find('ul', {'class': 'pagination'})
    return page


soup = getdata(url)
print(getnextpage(soup))
This prints:
<ul class="pagination">
<li class="disabled"><span>«</span></li>
<li class="active"><span>1</span></li>
<li>2</li>
<li>3</li>
<li>4</li>
<li>5</li>
<li>6</li>
<li>7</li>
<li>8</li>
<li class="disabled"><span>...</span></li>
<li>101</li>
<li>102</li>
<li>»</li>
</ul>
Which is not exactly what I am looking for, I wanted to return only the html elements from the first page to the last page for example:
https://www.letsride.co.uk/routes/search?sort_by=rating&page=1
https://www.letsride.co.uk/routes/search?sort_by=rating&page=2
...
..
.
https://www.letsride.co.uk/routes/search?sort_by=rating&page=102
You can use selenium with python to simulate a browser and get the site then click on the button as many times as you want or until the button is no longer there. I chose to do it only 10 times because the list seems to be almost infinite.
Then I printed out all the URLs on the site, but you can just as easily store them in a list instead.
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver import ActionChains
import time

options = Options()
options.headless = False
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
driver.get("https://www.letsride.co.uk/routes/search?sort_by=rating")

load_more = True
# while load_more:
for i in range(10):
    time.sleep(0.2)
    try:
        load_more_btn = driver.find_element_by_xpath('/html/body/div[2]/section/div[2]/div/div[3]/div/a')
        load_more_btn.click()
    except Exception:
        # Button is gone — nothing more to load.
        load_more = False

# BUG FIX: the original XPath was "//a[#href]". '#' is not valid XPath;
# attribute tests use '@', so "//a[@href]" selects every <a> with an href.
links = driver.find_elements_by_xpath("//a[@href]")
for link in links:
    print(link.get_attribute('href'))
you could use a cleaning function to get rid of non-url elements - basically you need to check each element against a variable that has the canonical url form (https:// ....)
I haven't tested this against your code, sorry — hope you'll be able to adapt it accordingly.
tester = "https://www.letsride.co.uk"  # modify this var accordingly to your needs


def cleaner(data):
    """Return only the items of *data* that begin with the canonical URL prefix.

    The original compared a manual slice (items[0:len(tester)] == tester);
    str.startswith performs the identical test more clearly, and a list
    comprehension replaces the append loop.
    """
    return [item for item in data if item.startswith(tester)]
This is the page I'm trying to scrape:
https://etherscan.io/address/0xCcE984c41630878b91E20c416dA3F308855E87E2
I want to scrape the listbox href next to the Token label.
I need to scrape href from
class="link-hover d-flex justify-content-between align-items-center"
so my code:
import requests
from bs4 import BeautifulSoup

# Download the address page and pull the href off the first element that
# carries the full utility-class list.
raw_html = requests.get('https://etherscan.io/address/0xCcE984c41630878b91E20c416dA3F308855E87E2').text
document = BeautifulSoup(raw_html, 'html.parser')
target = document.find(class_='link-hover d-flex justify-content-between align-items-center')
href = target['href']
however the result is nothing.
Can anyone help me?
The element of interest is rendered by JavaScript. Thus, you will need some browser automation software to render the JavaScript, in order to get the full HTML necessary.
Note: You could use requests-html which supports JavaScript rendering. However, it does use a browser automation software itself, so, in my opinion, it's best to get rid of the "middle-man".
Selenium
import bs4  # the original used bs4.BeautifulSoup without importing bs4
from selenium import webdriver

browser = webdriver.Firefox()
browser.get("https://etherscan.io/address/0xCcE984c41630878b91E20c416dA3F308855E87E2")

# Expand the token dropdown so its JavaScript-rendered content exists.
elem = browser.find_element_by_id("availableBalanceDropdown")
elem.click()

# BUG FIX: Selenium exposes the rendered markup as the *attribute*
# `page_source`; there is no `page_content()` method.
soup = bs4.BeautifulSoup(browser.page_source, features="html.parser")
Playwright
import bs4  # BUG FIX: the original used bs4.BeautifulSoup without importing bs4
from playwright.sync_api import sync_playwright

with sync_playwright() as play:
    browser = play.chromium.launch()
    page = browser.new_page()
    page.goto("https://etherscan.io/address/0xCcE984c41630878b91E20c416dA3F308855E87E2")
    # Click the dropdown so the JS-rendered token list is present.
    page.click("#availableBalanceDropdown")
    soup = bs4.BeautifulSoup(page.content(), features="html.parser")
    browser.quit()
Once you have the bs4.BeautifulSoup object, it's just a matter of scraping for the CSS selector.
import bs4

soup = bs4.BeautifulSoup(...)  # From above examples

# A compound class attribute becomes a dotted CSS selector: every listed
# class must be present on an element for it to match.
selector = ".link-hover.d-flex.justify-content-between.align-items-center"
elems = soup.select(selector)
I am trying to scrape some info from a page with Python and BeautifulSoup, and I can't seem to write the right path to what I need. The HTML is:
<div class="operator active" data-operator_name="Etisalat" data-
operator_id="5"><div class="operator_name_etisalat"></div></div>
And I am trying to get that operator name "Etisalat". I got this far:
def list_contries():
    """Select France in the country dropdown, then scrape the operator name.

    Returns the operator name string, or None when the div is not present
    in the static HTML.
    """
    select = Select(driver.find_element_by_id('international_country'))
    select.select_by_visible_text('France')

    request = requests.get("https://mobilerecharge.com/buy/mobile_recharge?country=Afghanistan&operator=Etisalat")
    content = request.content
    soup = BeautifulSoup(content, "html.parser")
    # print(soup.prettify())

    prov = soup.find("div", {"class": "operator active"})
    # prov = soup.find("div", {"class": "operator deselected"})
    # BUG FIX: the original subscripted find()'s result directly and then
    # called .text on the attribute value. find() returns None when nothing
    # matches (the "active" class is likely set by JavaScript on the live
    # site), so guard first; and the attribute value is already a plain
    # string, so .text does not exist on it — just strip it.
    if prov is None:
        print("operator div not found - content is likely rendered by JS")
        return None
    operator = prov['data-operator_name'].strip()
    print(operator)
    return operator
But this just returns a NoneType, so something is not right. Can anyone please tell me what I am doing wrong? Thanks.
You could use CSS selector. CSS selector [data-operator_name] will select any tag with attribute data-operator_name. Example with Beautiful Soup:
data = """<div class="operator active" data-operator_name="Etisalat" data-
operator_id="5"><div class="operator_name_etisalat"></div></div>"""

from bs4 import BeautifulSoup

# [data-operator_name] matches any tag carrying that attribute; take the
# first such tag and read the attribute's value off it.
document = BeautifulSoup(data, 'lxml')
first_match = document.select_one('[data-operator_name]')
print(first_match['data-operator_name'])
This will print:
Etisalat
EDIT:
To select multiple tags with attribute "data-operator_name", use .select() method:
data = """<div class="operator active" data-operator_name="Etisalat" data-
operator_id="5"><div class="operator_name_etisalat"></div></div>"""

from bs4 import BeautifulSoup

# .select() returns every tag carrying the attribute; print each value.
document = BeautifulSoup(data, 'lxml')
for match in document.select('[data-operator_name]'):
    print(match['data-operator_name'])
Somehow, when I access the link from the browser, I am not able to see the field you are after unless I inspect the element. Hence, I have used Selenium in my answer.
from bs4 import BeautifulSoup
from selenium import webdriver

scrapeLink = 'https://mobilerecharge.com/buy/mobile_recharge?country=Afghanistan&operator=Etisalat'

# Raw string so the backslash in the Windows path can never be read as an
# escape sequence ('\g' happens to survive, but r'...' is the safe form).
driver = webdriver.Firefox(executable_path=r'C:\geckodriver.exe')
driver.get(scrapeLink)

# Pull the rendered DOM out of the live page, then hand it to bs4.
html = driver.execute_script('return document.body.innerHTML')
driver.close()

soup = BeautifulSoup(html, 'html.parser')

# PERF FIX: the original re-ran find_all() on every loop iteration and
# indexed into the fresh list; query the tree once and iterate directly.
for operator_div in soup.find_all('div', class_='operator'):
    print(operator_div.get('data-operator_name'))
Output:
Roshan
Etisalat
MTN
Wireless
I've got the following code trying to return data from some html, however I am unable to return what I require...
import urllib2  # NOTE(review): unused in this snippet, and Python 2 only
from bs4 import BeautifulSoup
from time import sleep  # NOTE(review): unused in this snippet


def getData():
    """Print every <h3> element found inside div.blocks of C:/html.html."""
    # `with` guarantees the file handle is closed (the original leaked it).
    with open('C:/html.html', 'rb') as htmlfile:
        html = htmlfile.read()
    # Name the parser explicitly; a bare BeautifulSoup(html) emits a
    # warning and may pick different parsers on different machines.
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="blocks")
    for item in items:
        links = item.find_all('h3')
        for link in links:
            # parenthesised print works on Python 2 and 3 alike
            # (the original `print link` is Python 2 only)
            print(link)


getData()
Returns the a list of following:
<h3>
<a href="http://www.mywebsite.com/titles" title="Click for details(x)">
TITLE STUFF HERE (YES)
</a>
</h3>
<h3>
<a href="http://www.mywebsite.com/titles" title="Click for details(x)">
TITLE STUFF HERE (MAYBE)
</a>
</h3>
I want to be able to return just the title: TITLE STUFF HERE (YES) and TITLE STUFF HERE (MAYBE)
Another thing I want to be able to do is to use the
soup.find_all("a", limit=2) function but instead of "limit" and instead of returning two results only I want it to return ONLY the second link... so a select feature not a limit? (Does such a feature exist?)
import urllib2  # NOTE(review): unused in this snippet, and Python 2 only
from bs4 import BeautifulSoup
from time import sleep  # NOTE(review): unused in this snippet


def getData():
    """Print the text of every link that sits directly inside an <h3>.

    Only anchors whose parent tag is <h3> are titles; other anchors in
    the same div.blocks container are skipped.
    """
    # `with` guarantees the file handle is closed (the original leaked it).
    with open('C:/html.html', 'rb') as htmlfile:
        html = htmlfile.read()
    # Name the parser explicitly to avoid the "no parser" warning and
    # machine-dependent parser selection.
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_="blocks")
    for item in items:
        links = item.find_all('a')
        for link in links:
            if link.parent.name == 'h3':
                print(link.text)


getData()
You can also just find all the links from the very beginning and check both the parent is h3 and the parent's parent is a div with class blocks