Can't parse span id on beautifulsoup - python

I am trying to write a scraper, but I have run into an issue.
I can parse a "class in a span" and a "class in a div", but when I try to parse an "id in a span" it doesn't print the data I want.
# Question code: read token name, contract address, and transfer count
# from a BscScan token page with static HTML parsing.
from bs4 import BeautifulSoup
from urllib import request
from urllib.request import Request, urlopen

page_url = 'https://bscscan.com/token/0xc3d33bdd0b6cea10eb496fbc7592e45f2624c0a5'
req = Request(page_url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
soup = BeautifulSoup(webpage, 'html.parser')

# The two class-based lookups work; the id-based span below is apparently
# filled in by JavaScript on the live site (see the answer), so its static
# text comes back empty.
name = soup.find('span', class_='text-secondary small').text
add = soup.find('div', class_='mr-3').text
trans = soup.find('span', attrs={'id': 'totaltxns'}).text
print(name, add, trans)

You need to pick up a session cookie then make a request to an additional endpoint. sid needs to be dynamically picked up as well.
import re
import requests


def get_transfer_count(token: str) -> str:
    """Return the total-transfer count shown on a BscScan token page.

    The count is rendered by JavaScript, so we fetch the page once to pick
    up a session cookie and the per-session ``sid`` value, then call the
    Ajax endpoint that returns the transfer table.

    Args:
        token: Contract address of the token.

    Returns:
        The transfer count as a string, exactly as embedded in the response.

    Raises:
        AttributeError: if either regex fails to match (site layout change).
    """
    # BUG FIX: original signature was `def get_transfer_count(str:token)->str:`
    # — the parameter was named `str` and annotated with the undefined name
    # `token`, which raises NameError when the def is evaluated.
    with requests.Session() as s:
        s.headers = {'User-Agent': 'Mozilla/5.0'}
        r = s.get(f'https://bscscan.com/token/{token}')
        # sid is generated per session and must be scraped from the page.
        sid = re.search(r"var sid = '(.*?)'", r.text).group(1)
        r = s.get(f'https://bscscan.com/token/generic-tokentxns2?m=normal&contractAddress={token}&a=&sid={sid}&p=1')
        return re.search(r"var totaltxns = '(.*?)'", r.text).group(1)


token = '0x8df9655178350146eAD837B15ba3D32c9Fe1497d'
get_transfer_count(token)

Related

How do I extract the underlined value in red below and save it as a list?

How do I extract the underlined value in red below and save it as a list?
You want to extract the Memcode value in href to a in p tag using soup.
However, I don't know how to extract it at all.
Please help me.
My code
# Question code: list the <a> anchors inside the member-list container.
# Fixes vs. original: `list = []` shadowed the builtin and was never used
# (removed); `req` was built but never passed to urlopen (removed); the
# loop body's lost indentation is restored.
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")

# Print every anchor; extracting the memCode query value from each href
# is the part the question asks about.
for anchor in soup.find("div", class_="memList memList-col-3").find_all("a"):
    print(anchor)
try this, using css selector
# Answer: select the member links with a CSS selector; the member code is
# the third path segment of each href.
import requests
from bs4 import BeautifulSoup

resp = requests.get('https://www.gjcouncil.go.kr/kr/member/name.do')
soup = BeautifulSoup(resp.text, "html.parser")
anchors = soup.select("div[id='member_list'] > ul > li > a")
for a in anchors:
    segments = a['href'].split("/")
    print(segments[2])
08070
00716
08040
....
....
You can use split on the "=" and take the -1 index. I also changed the class .
# Answer: split each .btn-home href on "=" and keep the -1 index — the
# value of the final query parameter (the member code).
# Fix vs. original: `req = urllib.request.Request(url)` was created but
# never used (urlopen was called with the bare url), so it is removed.
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")
ids = [i['href'].split('=')[-1] for i in soup.select('.btn-home')]
# Answer: collect the last seven characters of each real href, skipping
# the "#LINK" placeholder anchors.
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")

href_list = []
member_div = soup.find("div", class_="memList memList-col-3")
for anchor in member_div.find_all("a"):
    link = anchor['href']
    if link != '#LINK':
        href_list.append(link[-7:])
print(href_list)
['7620212', '7670126', '7670420', '7650601', '7890930', '7800407', '7660925', '7641102', '7731222', '7801011', '7570803', '7770106', '7590808', '7700831', '7580115', '7710713', '7680112', '7621125', '7711117', '7680213', '7640925', '7591214']
One of the best method is using Regular-Expression.
Check out this code :
# Answer: pull every href from the member list, drop the "#LINK"
# placeholders, then extract the memCode value with a regular expression.
import urllib.request
from bs4 import BeautifulSoup
import re

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")

hrefs = [a['href'] for a in
         soup.find("div", class_="memList memList-col-3").find_all("a")]

pattern = re.compile(r'memCode=(\w+)')
real_links = [h for h in hrefs if h != '#LINK']
result = [pattern.search(h).group(1) for h in real_links]
print(result)

Python request change lang

How can I get a page in a language other than the default?
My code:
import requests
from bs4 import BeautifulSoup


def scrape(page):
    """Fetch *page* and return it parsed as a BeautifulSoup tree.

    Returns None (implicitly) when the response status is not 200.
    """
    result = requests.get(page, stream=True)
    if result.status_code == 200:
        # Fix: reuse the body we already downloaded instead of issuing a
        # second, redundant requests.get for the same URL (the original
        # fetched the page twice).
        return BeautifulSoup(result.content, 'html.parser')


product = scrape("https://www.tatechnix.de/tatechnix/gx/product_info.php?info=p44232")
print(product)
When I open this page I get the German (DE) version — how can I get it in English (EN)?
The site has no separate URL or prefix for the language; it only changes via a button.
Edit:
import re
import requests
from bs4 import BeautifulSoup


def scrape(page):
    """Attempt to fetch *page* in English and return its product title."""
    # Ask the server for English content via the Accept-Language header.
    headers = {'Accept-Language': 'en-US,en;q=0.8'}
    result = requests.get(page, stream=True)
    if result.status_code == 200:
        response = requests.get(page, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.select('.product-info-title-desktop')[0].text


product = scrape("https://www.tatechnix.de/tatechnix/gx/product_info.php?info=p44232")
print(product)
Nothing change :/
Try it:
import re
import requests
from bs4 import BeautifulSoup


def scrape(page):
    """Return the product title from *page* (None if the GET fails)."""
    first = requests.get(page, stream=True)
    if first.status_code == 200:
        response = requests.get(page)
        soup = BeautifulSoup(response.content, 'html.parser')
        title_node = soup.select('.product-info-title-desktop')[0]
        return title_node.text


# The site switches language through the "language" query parameter.
product = scrape("https://www.tatechnix.de/tatechnix/gx/product_info.php?info=p44232&language=en")
print(product)

BeautifulSoup: Reading Span Class Elements

I am having some issues web scraping information from a particular page's span class element, using the BeautifulSoup and requests modules in Python. It keeps returning blank information: " ". Here's my code:
# Question code, made runnable: the original fragment ended with a
# top-level `return weather` followed by a stray backtick — both syntax
# errors outside a function — and was missing its imports.
import bs4
import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # defined but not passed to the GET below
res = requests.get('https://www.theweathernetwork.com/ca/weather/ontario/toronto')
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')
# NOTE(review): per the answer, this element is populated by JavaScript on
# the live site, so static parsing finds it blank.
weather_elem = soup.find('span', {'class': 'wxcondition'})
weather = weather_elem
print(weather)
The data is loaded through JavaScript so BeautifulSoup doesn't see anything. But you can simulate the Ajax with the requests module:
import json
import requests
from bs4 import BeautifulSoup

url = 'https://www.theweathernetwork.com/ca/weather/ontario/toronto'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# The alternate-link href ends in the site's place code; the public
# weather API is keyed by that code, lower-cased.
alternate = soup.select_one('link[rel="alternate"]')
place_code = alternate['href'].split('=')[-1].lower()

ajax_url = 'https://weatherapi.pelmorex.com/api/v1/observation/placecode/' + place_code
data = requests.get(ajax_url).json()
# uncomment to print all data:
# print(json.dumps(data, indent=4))
print(data['observation']['weatherCode']['text'])
Prints:
Partly cloudy

Problem sending data through post request in Python

I am trying to input a decision start and end date into 2 input boxes on the Gosport Council website by sending a post request. Whenever I print out the text received from after I send the request it gives me the info shown on the input page, not the loaded page
import requests

payload = {
    "applicationDecisionStart": "1/8/2018",
    "applicationDecisionEnd": "1/10/2018",
}

# Question code: note this issues a GET while passing `data=` — the form
# actually needs a POST to the results endpoint, as the answer shows.
with requests.Session() as session:
    r = session.get(
        "https://publicaccess.gosport.gov.uk/online-applications/search.do?action=advanced",
        timeout=10,
        data=payload,
    )
    print(r.text)
If I execute it I want it to print out the HTML with the href links for example
<a href="/online-applications/applicationDetails.do?keyVal=PEA12JHO07E00&activeTab=summary">
But my code won't show anything like this
I observe the POST, not GET which you are doing, is as follows (ignoring empty fields in POST):
from bs4 import BeautifulSoup as bs
import requests

# POST the advanced-search form fields the site actually sends
# (empty fields omitted).
payload = {
    'caseAddressType': 'Application',
    'date(applicationDecisionStart)': '1/8/2018',
    'date(applicationDecisionEnd)': '1/10/2018',
    'searchType': 'Application',
}

with requests.Session() as s:
    r = s.post('https://publicaccess.gosport.gov.uk/online-applications/advancedSearchResults.do?action=firstPage', data=payload)
    soup = bs(r.content, 'lxml')
    # (link text, href) for every anchor inside the results container.
    info = [(item.text.strip(), item['href']) for item in soup.select('#searchresults a')]
    print(info)

## later pages
#https://publicaccess.gosport.gov.uk/online-applications/pagedSearchResults.do?action=page&searchCriteria.page=2
Loop over pages:
from bs4 import BeautifulSoup as bs
import requests

payload = {
    'caseAddressType': 'Application',
    'date(applicationDecisionStart)': '1/8/2018',
    'date(applicationDecisionEnd)': '1/10/2018',
    'searchType': 'Application',
}

with requests.Session() as s:
    # First page comes back from the POST of the search form.
    r = s.post('https://publicaccess.gosport.gov.uk/online-applications/advancedSearchResults.do?action=firstPage', data=payload)
    soup = bs(r.content, 'lxml')
    info = [(a.text.strip(), a['href']) for a in soup.select('#searchresults a')]
    print(info)
    # The last "a.page" link following the <span> holds the final page number.
    pages = int(soup.select('span + a.page')[-1].text)
    # Remaining pages are plain GETs against the paged-results endpoint,
    # reusing the session cookie established by the POST.
    for page in range(2, pages + 1):
        r = s.get('https://publicaccess.gosport.gov.uk/online-applications/pagedSearchResults.do?action=page&searchCriteria.page={}'.format(page))
        soup = bs(r.content, 'lxml')
        info = [(a.text.strip(), a['href']) for a in soup.select('#searchresults a')]
        print(info)
the url and data are incorrect
use Chrome to analyse the response
press F12 to open Developer Tools and switch to the "Network" tab, then submit your page and analyse the first request initiated by Chrome.
what you need:
Headers - General - Request URL
Headers - Request Headers
Headers - Form Data
you also need a package such as bs4 to parse the HTML

Using Python Beautifulsoup to collect data from LinkedIn

I'm trying to export my LinkedIn contacts' names using the Python BeautifulSoup module. My code is as below:
# Question code, modernized: the original mixed Python 2 print statements
# into an otherwise Python 3 file and used a bare `except:` (which also
# swallows KeyboardInterrupt/SystemExit). Both are fixed; logic unchanged.
import requests
from bs4 import BeautifulSoup

client = requests.Session()

HOMEPAGE_URL = 'https://www.linkedin.com'
LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'
CONNECTIONS_URL = 'https://www.linkedin.com/mynetwork/invite-connect/connections/'

# Pick up the CSRF token from the login form on the homepage.
html = client.get(HOMEPAGE_URL).content
soup = BeautifulSoup(html, "html.parser")
csrf = soup.find(id="loginCsrfParam-login")['value']

login_information = {
    'session_key': 'username',
    'session_password': 'password',
    'loginCsrfParam': csrf,
}

# NOTE(review): a POST that raises no exception does not prove the login
# worked — the server may return an error page with status 200.
try:
    client.post(LOGIN_URL, data=login_information)
    print("Login Successful")
except requests.RequestException:
    print("Failed to Login")

html = client.get(CONNECTIONS_URL).content
soup = BeautifulSoup(html, "html.parser")
# The question reports this always prints [] — presumably the connections
# page is rendered client-side; verify in the browser.
print(soup.find_all('div', attrs={'class': 'mn-connection-card__name'}))
but the problem is I always get an empty list, like below:
Login Successful
[]
An html structure is like this:
<span class="mn-connection-card__name t-16 t-black t-bold">
Sombody's name
</span>
I think that I should change my soup.x method. I used find, select, find_all but I was not successful.
Thank you
I know I'm late to the party, but this works for LinkedIn right now:
import requests
from bs4 import BeautifulSoup

# create a session
client = requests.Session()

# create url page variables
HOMEPAGE_URL = 'https://www.linkedin.com'
LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'
CONNECTIONS_URL = 'https://www.linkedin.com/mynetwork/invite-connect/connections/'
ASPIRING_DATA_SCIENTIEST = 'https://www.linkedin.com/search/results/people/?keywords=Aspiring%20Data%20Scientist&origin=GLOBAL_SEARCH_HEADER'

# get url, soup object and csrf token value
html = client.get(HOMEPAGE_URL).content
soup = BeautifulSoup(html, "html.parser")
csrf = soup.find('input', dict(name='loginCsrfParam'))['value']

# create login parameters
login_information = {
    'session_key': 'your_email',
    'session_password': 'your_password',
    'loginCsrfParam': csrf,
}

# try and login
try:
    client.post(LOGIN_URL, data=login_information)
    print("Login Successful")
# Fix: was a bare `except:`, which also traps KeyboardInterrupt/SystemExit.
except requests.RequestException:
    print("Failed to Login")

# open the html with soup object
# html = client.get(CONNECTIONS_URL).content #opens connections_url
html = client.get(ASPIRING_DATA_SCIENTIEST).content  # opens ASPIRING_DATA_SCIENTIEST
soup = BeautifulSoup(html, "html.parser")
# print(soup.find_all('div', attrs={'class' : 'mn-connection-card__name'}))
# print(soup)
print(soup.prettify())
If you're trying to extract the name, all you need is
from bs4 import BeautifulSoup

# `html` is assumed to already hold the fetched page markup — TODO confirm
# it is defined by the surrounding context before running this fragment.
soup = BeautifulSoup(html, "html.parser")
name_spans = soup.find_all('span', attrs={'class': 'mn-connection-card__name'})
name_spans[0].text.strip()
Output
"Sombody's name"

Categories