Using Python BeautifulSoup to collect data from LinkedIn

I'm trying to export my LinkedIn contacts' names using the Python BeautifulSoup module. My code is as below:
import requests
from bs4 import BeautifulSoup

client = requests.Session()

HOMEPAGE_URL = 'https://www.linkedin.com'
LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'
CONNECTIONS_URL = 'https://www.linkedin.com/mynetwork/invite-connect/connections/'

html = client.get(HOMEPAGE_URL).content
soup = BeautifulSoup(html, "html.parser")
csrf = soup.find(id="loginCsrfParam-login")['value']

login_information = {
    'session_key': 'username',
    'session_password': 'password',
    'loginCsrfParam': csrf,
}

try:
    client.post(LOGIN_URL, data=login_information)
    print "Login Successful"
except:
    print "Failed to Login"

html = client.get(CONNECTIONS_URL).content
soup = BeautifulSoup(html, "html.parser")
print soup.find_all('div', attrs={'class': 'mn-connection-card__name'})
but the problem is that I always get an empty list, like below:
Login Successful
[]
The HTML structure is like this:
<span class="mn-connection-card__name t-16 t-black t-bold">
Somebody's name
</span>
I think I need to change the soup method I'm calling. I tried find, select, and find_all, but was not successful.
Thank you.

I know I'm late to the party, but this works for LinkedIn right now:
import requests
from bs4 import BeautifulSoup

# create a session
client = requests.Session()

# create url page variables
HOMEPAGE_URL = 'https://www.linkedin.com'
LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'
CONNECTIONS_URL = 'https://www.linkedin.com/mynetwork/invite-connect/connections/'
ASPIRING_DATA_SCIENTIEST = 'https://www.linkedin.com/search/results/people/?keywords=Aspiring%20Data%20Scientist&origin=GLOBAL_SEARCH_HEADER'

# get url, soup object and csrf token value
html = client.get(HOMEPAGE_URL).content
soup = BeautifulSoup(html, "html.parser")
csrf = soup.find('input', dict(name='loginCsrfParam'))['value']

# create login parameters
login_information = {
    'session_key': 'your_email',
    'session_password': 'your_password',
    'loginCsrfParam': csrf,
}

# try and login
try:
    client.post(LOGIN_URL, data=login_information)
    print("Login Successful")
except:
    print("Failed to Login")

# open the html with soup object
# html = client.get(CONNECTIONS_URL).content  # opens CONNECTIONS_URL
html = client.get(ASPIRING_DATA_SCIENTIEST).content  # opens ASPIRING_DATA_SCIENTIEST
soup = BeautifulSoup(html, "html.parser")

# print(soup.find_all('div', attrs={'class' : 'mn-connection-card__name'}))
# print(soup)
print(soup.prettify())

If you're trying to extract the name, all you need is:
from bs4 import BeautifulSoup

# html is the connections page source fetched earlier
soup = BeautifulSoup(html, "html.parser")
target = soup.find_all('span', attrs={'class': 'mn-connection-card__name'})
target[0].text.strip()
Output
"Sombody's name"

Related

How to scrape stock data incorporating pagination next tag using python bs4?

The code cannot get to the next page; it only repeats in an infinite loop. I am using the example from Oxylabs.
Could you tell me what I'm doing wrong? Thank you.
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

url = 'https://hnx.vn/en-gb/cophieu-etfs/chung-khoan-ny.html'

while True:
    response = requests.get(url)
    soup = bs(response.content, "lxml")
    symbols = soup.find_all('td', class_='STOCK_CODE')
    for s in symbols:
        symbol = s.find('a').text
        print(symbol)

    next_page = soup.select_one('span', id='next')
    if next_page:
        next_url = next_page.get('href')
        url = urljoin(url, next_url)
    else:
        break
    print(url)
The information you want for the other pages is being returned via another call. You need to recreate that call (use your browser's network tools to see what is happening).
The request requires a token that is returned when the homepage is requested. This needs to be provided when requesting the other pages.
For example:
from bs4 import BeautifulSoup as bs
import requests

session = requests.Session()

req_homepage = session.get('https://hnx.vn/en-gb/cophieu-etfs/chung-khoan-ny.html')
soup_homepage = bs(req_homepage.content, "lxml")

for meta in soup_homepage.find_all('meta'):
    if meta.get('name', None) == '__RequestVerificationToken':
        token = meta['content']

data = {
    "p_issearch": 0,
    "p_keysearch": "",
    "p_market_code": "",
    "p_orderby": "STOCK_CODE",
    "p_ordertype": "ASC",
    "p_currentpage": 2,
    "p_record_on_page": 10,
}

headers = {
    "Referer": "https://hnx.vn/en-gb/cophieu-etfs/chung-khoan-ny.html",
    "__RequestVerificationToken": token,
    "X-Requested-With": "XMLHttpRequest",
}

for page in range(1, 4):
    print(f"Page {page}")
    data['p_currentpage'] = page
    req = session.post('https://hnx.vn/ModuleIssuer/List/ListSearch_Datas', data=data, headers=headers)
    json_content = req.json()['Content']
    soup = bs(json_content, "lxml")
    for td in soup.find_all('td', class_='STOCK_CODE'):
        symbol = td.find('a').text
        print(' ', symbol)
This would give you the following output:
Page 1
AAV
ACM
ADC
ALT
AMC
AME
AMV
API
APP
APS
Page 2
ARM
ART
ATS
BAB
BAX
BBS
BCC
BCF
BDB
BED
Page 3
BII
BKC
BLF
BNA
BPC
BSC
BST
BTS
BTW
BVS

Can't parse span id on beautifulsoup

I am trying to write a scraper, but I have run into an issue.
I can parse a "class in spans" and a "class in divs", but when I try to parse an "id in span" it doesn't print the data I want.
from bs4 import BeautifulSoup
from urllib import request
from urllib.request import Request, urlopen
req = Request('https://bscscan.com/token/0xc3d33bdd0b6cea10eb496fbc7592e45f2624c0a5', headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
soup = BeautifulSoup(webpage, 'html.parser')
name = soup.find('span', class_='text-secondary small').text
add = soup.find('div', class_='mr-3').text
trans = soup.find('span', attrs={'id':'totaltxns'}).text
print(name, add, trans)
You need to pick up a session cookie and then make a request to an additional endpoint. The sid needs to be picked up dynamically as well.
import requests
import re

def get_transfer_count(token: str) -> str:
    with requests.Session() as s:
        s.headers = {'User-Agent': 'Mozilla/5.0'}
        r = s.get(f'https://bscscan.com/token/{token}')
        # the sid is embedded in the page's JavaScript, so pick it up dynamically
        sid = re.search(r"var sid = '(.*?)'", r.text).group(1)
        r = s.get(f'https://bscscan.com/token/generic-tokentxns2?m=normal&contractAddress={token}&a=&sid={sid}&p=1')
        return re.search(r"var totaltxns = '(.*?)'", r.text).group(1)

token = '0x8df9655178350146eAD837B15ba3D32c9Fe1497d'
print(get_transfer_count(token))

Problem sending data through post request in Python

I am trying to input a decision start date and end date into two input boxes on the Gosport Council website by sending a POST request. Whenever I print out the text received after I send the request, it gives me the info shown on the input page, not the results page.
import requests

payload = {
    "applicationDecisionStart": "1/8/2018",
    "applicationDecisionEnd": "1/10/2018",
}

with requests.Session() as session:
    r = session.get("https://publicaccess.gosport.gov.uk/online-applications/search.do?action=advanced", timeout=10, data=payload)
    print(r.text)
When I execute it, I want it to print out the HTML with the href links, for example:
<a href="/online-applications/applicationDetails.do?keyVal=PEA12JHO07E00&activeTab=summary">
But my code won't show anything like this.
The request should be a POST, not the GET you are doing. The POST I observe is as follows (ignoring empty fields in the POST):
from bs4 import BeautifulSoup as bs
import requests

payload = {
    'caseAddressType': 'Application',
    'date(applicationDecisionStart)': '1/8/2018',
    'date(applicationDecisionEnd)': '1/10/2018',
    'searchType': 'Application',
}

with requests.Session() as s:
    r = s.post('https://publicaccess.gosport.gov.uk/online-applications/advancedSearchResults.do?action=firstPage', data=payload)
    soup = bs(r.content, 'lxml')
    info = [(item.text.strip(), item['href']) for item in soup.select('#searchresults a')]
    print(info)
## later pages
#https://publicaccess.gosport.gov.uk/online-applications/pagedSearchResults.do?action=page&searchCriteria.page=2
Loop over pages:
from bs4 import BeautifulSoup as bs
import requests

payload = {
    'caseAddressType': 'Application',
    'date(applicationDecisionStart)': '1/8/2018',
    'date(applicationDecisionEnd)': '1/10/2018',
    'searchType': 'Application',
}

with requests.Session() as s:
    r = s.post('https://publicaccess.gosport.gov.uk/online-applications/advancedSearchResults.do?action=firstPage', data=payload)
    soup = bs(r.content, 'lxml')
    info = [(item.text.strip(), item['href']) for item in soup.select('#searchresults a')]
    print(info)

    pages = int(soup.select('span + a.page')[-1].text)

    for page in range(2, pages + 1):
        r = s.get('https://publicaccess.gosport.gov.uk/online-applications/pagedSearchResults.do?action=page&searchCriteria.page={}'.format(page))
        soup = bs(r.content, 'lxml')
        info = [(item.text.strip(), item['href']) for item in soup.select('#searchresults a')]
        print(info)
The URL and data are incorrect.
Use Chrome to analyse the request:
Press F12 to open Developer Tools and switch to the "Network" tab, then submit your page and analyse the first request initiated by Chrome.
What you need:
Headers - General - Request URL
Headers - Request Headers
Headers - Form Data
You also need a package to parse the HTML, such as bs4.
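As a minimal sketch of that workflow (the URL, form fields and headers below are placeholders standing in for whatever you copy from the Network tab; the real values for this site are shown in the answer above):

import requests
from bs4 import BeautifulSoup

# Placeholders: replace with the Request URL, Form Data and Request Headers from DevTools
url = 'https://publicaccess.gosport.gov.uk/online-applications/advancedSearchResults.do?action=firstPage'
data = {
    'date(applicationDecisionStart)': '1/8/2018',
    'date(applicationDecisionEnd)': '1/10/2018',
    'caseAddressType': 'Application',
    'searchType': 'Application',
}
headers = {'User-Agent': 'Mozilla/5.0'}

r = requests.post(url, data=data, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
print([a['href'] for a in soup.select('#searchresults a')])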

I'm getting doctors' links by submitting POST requests using Python BeautifulSoup

import requests
from bs4 import BeautifulSoup

try:
    for count in range(123401, 123405):
        ctl00_RightContetHolder_TextBox1 = count
        r = requests.post('http://karnatakamedicalcouncil.com/RenewalReport.aspx',
                          data={'ctl00_RightContetHolder_TextBox1': count, 'Search': "submit"})
        soup = BeautifulSoup(r.text, 'html.parser')
        for i in soup.find('table', {'class': 'mGrid'}):
            for links in i.find('a', class_='Viewdetails'):
                print links
except:
    pass
I am trying to get each of the links in the mGrid tables, but haven't been able to retrieve them with Beautiful Soup. I don't understand why the anchor tags are not being found, or, if they are being found, why they are not being printed. Please help me.
Your request is missing the required __VIEWSTATE and __EVENTVALIDATION data. To get them, first make a GET request and extract the hidden input values with those IDs, then include them in your POST (search) request.
url = 'http://karnatakamedicalcouncil.com/RenewalReport.aspx'
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')

VIEWSTATE = soup.find(id='__VIEWSTATE')['value']
EVENTVALIDATION = soup.find(id='__EVENTVALIDATION')['value']

for count in range(123401, 123405):
    data = {
        '__VIEWSTATE': VIEWSTATE,
        '__VIEWSTATEENCRYPTED': '',
        '__EVENTVALIDATION': EVENTVALIDATION,
        'ctl00$RightContetHolder$TextBox1': count,
        'ctl00$RightContetHolder$hdnSearch': "Search",
    }
    r = requests.post(url, data=data)
    soup = BeautifulSoup(r.text, 'html.parser')
    for links in soup.find_all('a', class_='Viewdetails'):
        print(links['href'])

Scraping and parsing multi-page (aspx) table

I'm trying to scrape information on greyhound races. For example, I want to scrape http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena. This page shows all results for the dog Hardwick Serena, but it is split over several pages.
Inspecting the page shows this under the 'next page' button:
<input type="submit" name="ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl12" value=" " title="Next Page" class="rgPageNext">.
I was hoping for an HTML link that I could use for the next iteration of the scrape, but no luck.
Further inspection, by looking at network traffic, shows that the browser sends a horribly long (hashed?) string for __VIEWSTATE, among others. Likely to protect the database?
I'm looking for a way to scrape all pages of one dog, either by iterating over all pages, or by increasing the page length to show 100+ lines on page 1. The underlying site is .aspx.
I'm using Python 3.5 and BeautifulSoup.
Current code:
import requests
from bs4 import BeautifulSoup

url = 'http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena'

with requests.session() as s:
    s.headers['user-agent'] = 'Mozilla/5.0'
    r = s.get(url)
    soup = BeautifulSoup(r.content, 'html5lib')

    target = 'ctl00$ctl00$mainContent$cmscontent$DogRaceCard$btnFilter_input'
    data = {tag['name']: tag['value']
            for tag in soup.select('input[name^=ctl00]') if tag.get('value')}
    state = {tag['name']: tag['value']
             for tag in soup.select('input[name^=__]')}
    data.update(state)

    numberpages = int(str(soup.find('div', 'rgWrap rgInfoPart')).split(' ')[-2].split('>')[1].split('<')[0])

    # for page in range(last_page + 1):
    for page in range(numberpages):
        data['__EVENTTARGET'] = target.format(page)
        # data['__VIEWSTATE'] = target.format(page)
        print(10)
        r = s.post(url, data=data)
        soup = BeautifulSoup(r.content, 'html5lib')

        tables = soup.findChildren('table')
        my_table = tables[9]
        rows = my_table.findChildren(['th', 'tr'])

        tabel = [[]]
        for i in range(len(rows)):
            cells = rows[i].findChildren('td')
            tabel.append([])
            for j in range(len(cells)):
                value = cells[j].string
                tabel[i].append(value)

        table = []
        for i in range(len(tabel)):
            if len(tabel[i]) == 16:
                del tabel[i][-2:]
                table.append(tabel[i])
In this case, for each page requested, a POST request is issued with the form-url-encoded parameters __EVENTTARGET and __VIEWSTATE:
__VIEWSTATE can easily be extracted from an input tag.
__EVENTTARGET is different for each page; its value is passed to a JavaScript function by each page link, so you can extract it with a regex:
<a href="javascript:__doPostBack('ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl07','')">
<span>2</span>
</a>
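For illustration, here is that regex applied to the href from the snippet above (a small sketch; the href string is hard-coded here, whereas the full script below pulls it from each page link):

import re

# href taken from the example anchor above
href = "javascript:__doPostBack('ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl07','')"
event_target = re.search(r"__doPostBack\('(.*)',", href).group(1)
print(event_target)  # ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl07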
The Python script:
from bs4 import BeautifulSoup
import requests
import re

# extract data from page
def extract_data(soup):
    tables = soup.find_all("div", {"class": "race-card"})[0].find_all("tbody")

    item_list = [
        (
            t[0].text.strip(),   # date
            t[1].text.strip(),   # dist
            t[2].text.strip(),   # TP
            t[3].text.strip(),   # StmHCP
            t[4].text.strip(),   # Fin
            t[5].text.strip(),   # By
            t[6].text.strip(),   # WinnerOr2nd
            t[7].text.strip(),   # Venue
            t[8].text.strip(),   # Remarks
            t[9].text.strip(),   # WinTime
            t[10].text.strip(),  # Going
            t[11].text.strip(),  # SP
            t[12].text.strip(),  # Class
            t[13].text.strip()   # CalcTm
        )
        for t in (t.find_all('td') for t in tables[1].find_all('tr'))
        if t
    ]
    print(item_list)

session = requests.Session()

url = 'http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena'
response = session.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# get view state value
view_state = soup.find_all("input", {"id": "__VIEWSTATE"})[0]["value"]

# get all event target values
event_target = soup.find_all("div", {"class": "rgNumPart"})[0]
event_target_list = [
    re.search('__doPostBack\(\'(.*)\',', t["href"]).group(1)
    for t in event_target.find_all('a')
]

# extract data for the 1st page
extract_data(soup)

# extract data for each page except the first
for link in event_target_list[1:]:
    print("get page {0}".format(link))
    post_data = {
        '__EVENTTARGET': link,
        '__VIEWSTATE': view_state
    }
    response = session.post(url, data=post_data)
    soup = BeautifulSoup(response.content, "html.parser")
    extract_data(soup)
