Webscraping - html issue - python

I've been trying this code and getting some success, but I cannot figure out the next step.
# Step 1: scrape the ProCyclingStats rankings table, then load it with pandas.
import pandas as pd
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import lxml.html as lh
import pprint
import json

url2 = "https://www.procyclingstats.com/rankings.php"
print(colored('#Step1','green'))
response = requests.get(url2)
soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table', {'class':'basic'})
# Filtering on class "cu600" only matches the <th> elements that carry that
# class, which is why just two headers (Prev., Team) come back.  Use
# table.find_all('th') (no class filter) to get all of them.
headers = [heading.text for heading in table.find_all('th',{"class":"cu600"})]
#print(headers)
# Reuse the response fetched above instead of hitting the site a second time.
dfs = pd.read_html(response.text)[0]
#print(list(dfs))
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#    print(dfs)
print(colored('#Step1','red'))

# Step 2: rider page -- locate the "points per speciality" block.
print(colored('#Step2','green'))
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, 'lxml')
# BUG FIX: soup.find({'class': ...}) passed the dict as the *tag name*
# argument, and the value mixed HTML attribute syntax ('class="..."') into
# the string, so nothing could ever match.  Match on the class attribute.
table2 = soup.find(attrs={'class': 'mt10 pps'})
# pd.read_html(url3) directly returns 403 Forbidden (no browser headers),
# so parse the body of the requests response instead.
dfs2 = pd.read_html(response.text)[0]
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#    print(dfs2)
child_soup = soup.find('h3')
for child in child_soup.children:
    print("child : ", child)
print('\n'*3)
I end up with the child as Rider (result text below)
Ive been trying this code and getting some success but cannot figure out the next step
# Duplicate copy of the question's script, kept as posted; review notes added.
import pandas as pd
import requests
from termcolor import colored
from bs4 import BeautifulSoup
# NOTE(review): duplicate of the `import requests` above -- harmless but redundant.
import requests
import lxml.html as lh
import pprint
import json
url2 = "https://www.procyclingstats.com/rankings.php"
print(colored('#Step1','green'))
response = requests.get(url2)
soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table', {'class':'basic'})
# NOTE(review): only <th> elements carrying class "cu600" match this filter,
# which is why just two headers come back; drop the class filter to get all.
headers = [heading.text for heading in table.find_all('th',{"class":"cu600"})]
#print(headers)
#why do I only get two headers here (prev, team)?
# NOTE(review): second fetch of the same URL -- the response above could be reused.
response = requests.get(url2)
dfs = pd.read_html(response.text)[0]
#print(list(dfs))
#with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
# print(dfs)
print(colored('#Step1','red'))
print(colored('#Step2','green'))
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, 'lxml')
# NOTE(review): the dict is passed as the *tag name* argument and the value
# embeds HTML attribute syntax; use soup.find(attrs={'class': 'mt10 pps'}).
table2 = soup.find({'class':'class="mt10 pps"'})
#headers = [heading.text for heading in table1.find_all('th',{"class":"cu600"})]
#print(headers)
# Usually the line below is enough
# But for some reason returning Forbidden
#dfs = pd.read_html(url)[0]
response = requests.get(url3)
dfs2 = pd.read_html(response.text)[0]
#with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
#print(dfs2)
child_soup = soup.find('h3')
# NOTE(review): the loop body below lost its indentation in the paste.
for i in child_soup.children:
print("child : ", i)
print('\n'*3)
I end up with the child as Rider (result text below)
#Step2
child : Rider
What I'm trying to capture is the 'points per speciality' section and its values.
There is a second question as to why I only get two tags when they all appear to have the same name?
Photo and arrow showing desired result

# Answer: take every <th> in the table header, and parse the
# "points per speciality" list items into a {speciality: points} dict.
# (requests and BeautifulSoup are imported in the earlier snippet.)
import re

url2 = "https://www.procyclingstats.com/rankings.php"
response = requests.get(url2)
soup = BeautifulSoup(response.text, "lxml")
table = soup.find("table", {"class": "basic"})
thead = table.find("thead")
# No class filter: filtering on "cu600" only matched a subset of the headers.
headers = [heading.text for heading in thead.find_all("th")]

url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, "lxml")
ul = soup.find("ul", {"class": "basic"})
d = {}
for item in ul.find_all("li"):
    # Raw string: "\d" in a plain literal is an invalid escape sequence
    # (DeprecationWarning today, an error in future Python versions).
    m = re.search(r"(\d+)(.*)", item.text)
    if m:  # guard: an <li> without a leading number would otherwise crash
        d[m.group(2)] = m.group(1)
print(headers)
print(d)
# ['#', 'Prev.', 'Diff.', 'Rider', 'Team', 'Points']
# {'One day races': '1641', 'GC': '3444', 'Time trial': '1147', 'Sprint': '302', 'Climber': '3816'}

Related

Unable to prints the Names and links in python

I got stuck on extracting the names and links — it doesn't return anything for those, but it does print the prices.
link from where I scraping is: https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/
# Question code, kept as posted; review notes explain why names/links fail.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
url = 'https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/'
r = requests.get(url)
# NOTE(review): requests.get has already returned the full body here;
# sleeping afterwards does not change what was downloaded.
time.sleep(6)
soup = BeautifulSoup(r.content, 'html.parser')
content = soup.find_all('div', class_ = 'col-md-12 pr-0 pl-0')
# NOTE(review): the loop body below lost its indentation in the paste,
# and `property` shadows the Python builtin of the same name.
for property in content:
# NOTE(review): ['href'] is read from a <div>; the link presumably lives on a
# nested <a> tag (the accepted answer reads property.find_all("a")) -- so this
# lookup cannot yield the URL.
links = property.find('div',{'class': 'col-md-12 d-table-cell align-middle'})['href']
# NOTE(review): an <img> element has no text content, so .text is empty; the
# name appears to come from a nearby <a> per the answer -- TODO confirm.
name= property.find('img', class_ = 'img-fluid').text.strip()
price= property.find('div', class_ = 'ProductPriceRating d-table-cell text-center pl-1 pr-1 align-middle').text.strip()
print(name,links,price)
You can try like this
# Answer: pull link, name and price from each product card and save to CSV.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import csv

url = 'https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/'
r = requests.get(url)
# time.sleep(6)
soup = BeautifulSoup(r.content, 'html.parser')
content = soup.find_all('div', class_='col-md-12 pr-0 pl-0')
# print(content)

header = ['url', 'item', 'price']
data = []
for card in content:  # renamed: `property` shadowed the builtin
    # The first <a> in a card carries the product URL, the second its name;
    # index the result set directly instead of building throwaway lists.
    anchors = card.find_all("a")
    link = anchors[0]['href']
    title = anchors[1].get_text(strip=True)
    price = card.find('div', {'class': "ProductPriceRating"}).get_text(strip=True)
    data.append([link, title, price])
print(data)
df = pd.DataFrame(data, columns=header)
# index=False: without it pandas writes a spurious unnamed index column.
df.to_csv("products.csv", index=False)

Iterate Over URLs Using BeautifulSoup

I have written some code to gather URLs for each race course from https://www.horseracing.net/racecards. I have also written some code to scrape data from each race course page.
Each bit of code works as it should but I am having trouble creating a for loop to loop through all the race course URLs.
Here's the code to scrape the course URLs:
# Collect the racecard URL for each course from today's racecards page.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"

reqs = requests.get(todays_racecard_url)
soup = BeautifulSoup(reqs.text, 'html.parser')

course_urls = []
for h in soup.find_all('h3'):  # find_all: findAll is the legacy alias
    # href=True returns only anchors that actually carry a link, so the
    # original bare "except: pass" (which hid every error, including real
    # bugs) is no longer needed.
    a = h.find('a', href=True)
    if a is not None:
        course_urls.append(urljoin(base_url, a['href']))

for card_url in course_urls:
    print(card_url)
And here's the code to scrape the pages:
# Scrape runner names, tip counts and tipster names from one racecard page
# and append them to NewspaperTips.csv.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

url = "https://www.horseracing.net/racecards/fontwell/13-05-21"
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")

runner = []
tips = []
tipsters = []

for container in soup.find_all('div', class_='row-cell-right'):
    runner.append(container.h5.a.text)

    # Look each span up once instead of twice (find + conditional re-find).
    tip_span = container.find('span', class_='tip-text number-tip')
    tips.append(tip_span.text if tip_span else '')

    tipster_span = container.find('span', class_='pointers-text currency-text')
    tipsters.append(tipster_span.text if tipster_span else '')

newspaper_tips = pd.DataFrame({
    'Runners': runner,
    'Tips': tips,
    'Tipsters': tipsters,
})
newspaper_tips['Tipsters'] = newspaper_tips['Tipsters'].str.replace(' - ', '')
# mode='a' + header=False: rows are appended so several courses can share
# one file, but the CSV never gets a header row and re-runs duplicate data.
newspaper_tips.to_csv('NewspaperTips.csv', mode='a', header=False, index=False)
How do I join them to get the result I'm looking for?
It could be combined as follows:
# Combined answer: walk every course URL found on the racecards page and
# scrape each course page into one DataFrame / CSV.
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"

req = requests.get(todays_racecard_url)
soup_racecard = BeautifulSoup(req.content, 'html.parser')

rows = []
for h in soup_racecard.find_all('h3'):
    a = h.find('a', href=True)  # only headings that really link somewhere
    if not a:
        continue
    url = urljoin(base_url, a['href'])
    print(url)
    results = requests.get(url)
    soup_url = BeautifulSoup(results.text, "html.parser")
    for container in soup_url.find_all('div', class_='row-cell-right'):
        runner_name = container.h5.a.text
        # Look each span up once instead of twice per field.
        tip_span = container.find('span', class_='tip-text number-tip')
        tipster_span = container.find('span', class_='pointers-text currency-text')
        rows.append([runner_name,
                     tip_span.text if tip_span else '',
                     tipster_span.text if tipster_span else ''])

# Build the frame once from the collected rows: growing a DataFrame with
# df.loc[len(df)] re-indexes on every insert (quadratic overall).
df = pd.DataFrame(rows, columns=['Runners', 'Tips', 'Tipsters'])
df['Tipsters'] = df['Tipsters'].str.replace(' - ', '')
df.to_csv('NewspaperTips.csv', index=False)
Giving you a CSV starting:
Runners,Tips,Tipsters
Ajrad,2,NEWMARKET
Royal Tribute,1,The Times
Time Interval,1,Daily Mirror
Hemsworth,1,Daily Express
Ancient Times,,
Final Watch,,
Hala Joud,,
May Night,1,The Star
Tell'Em Nowt,,

Data Scrape Output into Dataframe

Hello everyone. I have scraped this information from a job-listing site so far. Everything seems to work well; however, I am struggling to get this information into a data frame with headers and everything. Any help is appreciated.
My Full code is:
# Question code, kept as posted; review notes added.
import requests
from bs4 import BeautifulSoup
import pandas as pd
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')
# NOTE(review): if the filter is ever handed a None string, text.lower()
# raises AttributeError -- `text and 'test' in text.lower()` is safer.
# TODO confirm against the live page markup.
python_jobs = results.find_all('h2',string=lambda text: 'test' in text.lower())
# NOTE(review): the loop bodies below lost their indentation in the paste;
# each body line must be indented under its `for`.
for p_job in python_jobs:
link = p_job.find('a')['href']
print(p_job.text.strip())
print(f"Apply Here: {link}")
# One <section class="card-content"> per job card.
job_elems = results.find_all('section', class_= 'card-content')
for job_elem in job_elems:
title_elem = job_elem.find('h2', class_='title')
company_elem = job_elem.find('div', class_='company')
location_elem = job_elem.find('div', class_='location')
# Cards missing any of the three fields are skipped.
if None in (title_elem, company_elem, location_elem):
continue
print(title_elem.text.strip())
print(company_elem.text.strip())
print(location_elem.text.strip())
print()
Not sure how to approach this.
use concat() for all columns and then append() to one dataframe in loop
# Answer A: build one small frame per job and combine them into a DataFrame.
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')

python_jobs = results.find_all('h2', string=lambda text: 'test' in text.lower())
for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print(f"Apply Here: {link}")

job_elems = results.find_all('section', class_='card-content')

# Collect the per-job frames and concatenate once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
frames = []
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue  # skip incomplete cards
    frames.append(pd.concat([pd.Series(title_elem.text.strip()),
                             pd.Series(company_elem.text.strip()),
                             pd.Series(location_elem.text.strip())], axis=1))
df = pd.concat(frames) if frames else pd.DataFrame()
print(df)
You can save the job details (i.e, title, company, and location) in a dictionary, then dataframe the dictionary.
# Answer B: collect the job details into a dict of dicts, then frame it.
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'

# Fetch and parse the listing page, then narrow to the results container.
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')

# Echo every job whose <h2> text mentions "test", with its apply link.
python_jobs = results.find_all('h2', string=lambda text: 'test' in text.lower())
for p_job in python_jobs:
    print(p_job.text.strip())
    print(f"Apply Here: {p_job.find('a')['href']}")

job_elems = results.find_all('section', class_='card-content')

# One {'position', 'company', 'location'} record per complete card,
# keyed 'opening 1', 'opening 2', ...; each card is also echoed as we go.
my_job_list = {}
opening_no = 1
for job_elem in job_elems:
    parts = (job_elem.find('h2', class_='title'),
             job_elem.find('div', class_='company'),
             job_elem.find('div', class_='location'))
    if None in parts:
        continue
    title_elem, company_elem, location_elem = parts
    my_job_list[f'opening {opening_no}'] = {
        'position': title_elem.text.strip(),
        'company': company_elem.text.strip(),
        'location': location_elem.text.strip(),
    }
    opening_no += 1
    print(title_elem.text.strip())
    print(company_elem.text.strip())
    print(location_elem.text.strip())

# Dict-of-dicts -> DataFrame (each opening becomes a column).
df = pd.DataFrame(my_job_list)
print(df)

Loop through different links on a website and scrape certain information

Good afternoon all. I'm hoping that somebody may help me with a problem relating to looping through multiple links on a website. Many thanks in anticipation of your help. I have this code below, which gets the info I need from the first link and creates the df I need to present it. But there are more than 600 more links on the website and I'm not sure how to go about it.
# Question code, kept as posted (looks notebook-derived); review notes added.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "https://auctions.royaltyexchange.com/auctions_overview/"
html = urlopen("https://auctions.royaltyexchange.com/auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview")
soup = BeautifulSoup(html, 'lxml')
# NOTE(review): bare expression -- only displays in a notebook, no effect in a script.
type(soup)
# Get the title
# NOTE(review): this assignment is immediately overwritten by the next line.
title = soup.title
title = soup.find('h1', class_='title -auction-page -dark').text.strip()
title
data = {'Name':['Title',title]}
df_title = pd.DataFrame(data)
irr = soup.find('span',attrs={'id':'current-irr'}).text.strip()
irr
data = {'value' : ['theoretical IRR',irr]}
df_irr = pd.DataFrame(data)
table = soup.find('table', class_='es-overview-table')
table_rows = table.find_all('tr')
res = []
# NOTE(review): the loop body below lost its indentation in the paste, and
# the comprehension reuses the name `tr`, shadowing the outer loop variable.
for tr in table_rows:
td = tr.find_all('td')
row = [tr.text.strip() for tr in td if tr.text.strip()]
if row:
res.append(row)
df_table = pd.DataFrame(pd.DataFrame(res).transpose())
df_final = pd.concat([df_title,df_irr ,df_table], axis=1, ignore_index = True)
df_final.head()
You can use this to get all the links on all pages primarily.
# Collect auction-detail URLs from all overview pages.
from urllib.request import urlopen
from urllib.parse import urljoin
import re
from bs4 import BeautifulSoup

raw_url = "https://auctions.royaltyexchange.com/"

# Compiled once instead of re-created for every call/iteration.
AUCTION_HREF = re.compile(r"^(/auctions/)")

def get_link(page_num, base=raw_url):
    """Return auction-detail URLs from overview pages 1..page_num.

    The base URL is now a defaulted parameter instead of a `global`,
    keeping the old get_link(n) call signature working.
    """
    link_ls = []
    for page in range(1, page_num + 1):
        url = base + "auctions_overview/?origin=overview&page=" + str(page)
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        for link in bs.find('div', {'class': '-list'}).find_all('a', href=AUCTION_HREF):
            print(link.attrs['href'])
            # urljoin avoids the "…com//auctions/…" double slash that naive
            # concatenation produced (visible in the sample output below).
            link_ls.append(urljoin(base, link.attrs['href']))
    return link_ls

link_list = get_link(55)  # the last page number
link_list
['https://auctions.royaltyexchange.com//auctions/hip-hop-royalties-danileighs-lil-bebe/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/k-pop-publishing-featuring-exo-and-tvxq/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/film-royalties-classic-comedy-trading-places/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/ben-jerrys-cherry-garcia-trademark-royalties/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/the-doobie-brothers-black-water-more/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/dirty-dancings-ive-had-the-time-of-my-life/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/multi-platinum-hip-hop-collection/?origin=overview&filter_value=overview',
...
On each page, specify the data you want to extract (eg title, name, etc.) and tell it the type of dataframe.
A slight refactor of #yganalyst and your code:
# Refactor: one function to enumerate auction URLs, one to scrape a page.
import pandas as pd
import re
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Hoisted: the original compiled this pattern once per overview page.
_AUCTION_HREF = re.compile(r"^(/auctions/)")

def get_link(page_num, raw_url):
    """Return auction-detail URLs from overview pages 1..page_num."""
    link_ls = []
    for page in range(1, page_num + 1):
        url = raw_url + "auctions_overview/?origin=overview&page=" + str(page)
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        for link in bs.find('div', {'class': '-list'}).find_all('a', href=_AUCTION_HREF):
            # urljoin fixes the "…com//auctions/…" double slash that
            # raw_url + href produced because raw_url ends with "/".
            link_ls.append(urljoin(raw_url, link.attrs['href']))
    return link_ls

def extract_auction(url2):
    """Scrape title, IRR and the overview table of one auction page."""
    data = {}
    html = urlopen(url2)
    soup = BeautifulSoup(html, 'lxml')
    data['Title'] = soup.find('h1', class_='title -auction-page -dark').text.strip()
    data['theoretical IRR'] = soup.find('span', attrs={'id': 'current-irr'}).text.strip()
    table = soup.find('table', class_='es-overview-table')
    for tr in table.find_all('tr'):
        cells = [td.text.strip() for td in tr.find_all('td') if td.text.strip()]
        if len(cells) >= 2:  # guard: a label with no value would IndexError
            data[cells[0].replace(':', '')] = cells[1]
    return data

base_url = "https://auctions.royaltyexchange.com/"
page_num = 1
link_list = get_link(page_num, base_url)

data = []
for ll in link_list:
    print(ll)
    data.append(extract_auction(ll))
df_final = pd.DataFrame(data)

Why does BeautifulSoup fail to extract data from websites to csv?

User Chrisvdberge helped me creating the following code :
# Question code, kept as posted; review notes added.
import pandas as pd
import requests
from bs4 import BeautifulSoup
url_DAX = 'https://www.eurexchange.com/exchange-en/market-data/statistics/market-statistics-online/100!onlineStats?viewType=4&productGroupId=13394&productId=34642&cp=&month=&year=&busDate=20191114'
# NOTE(review): verify=False disables TLS certificate verification -- a
# security risk; only use it as a last resort for a known-broken cert chain.
req = requests.get(url_DAX, verify=False)
html = req.text
# NOTE(review): `soup` is built but never used in this script (read_html
# parses the raw HTML string directly).
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_DAX.csv')
print(df)
url_DOW = 'https://www.cmegroup.com/trading/equity-index/us-index/e-mini-dow_quotes_settlements_futures.html'
req = requests.get(url_DOW, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
# NOTE(review): the CME pages presumably fill their tables via JavaScript
# after load, so the raw HTML fetched here lacks the values (the answer
# below calls the JSON endpoint instead) -- TODO confirm.
df = pd.read_html(str(html))[0]
df.to_csv('results_DOW.csv')
print(df)
url_NASDAQ = 'https://www.cmegroup.com/trading/equity-index/us-index/e-mini-nasdaq-100_quotes_settlements_futures.html'
req = requests.get(url_NASDAQ, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_NASDAQ.csv')
print(df)
url_CAC = 'https://live.euronext.com/fr/product/index-futures/FCE-DPAR/settlement-prices'
req = requests.get(url_CAC, verify=False)
html = req.text
soup = BeautifulSoup(html, 'lxml')
df = pd.read_html(str(html))[0]
df.to_csv('results_CAC.csv')
print(df)
I have the following result :
3 .csv files are created : results_DAX.csv (here, everything is ok, I have the values I want.) ; results_DOW.csv and results_NASDAQ.csv (here, the problem is that the .csv files don't have the wanted values.. I don't understand why ?)
As you can see in the code, 4 files should be created and not only 3.
So my questions are :
How to get 4 csv files ?
How to get values in the results_DOW.csv and in the results_NASDAQ.csv files ? (and maybe also in the results_CAC.csv file)
Thank you for your answers ! :)
Try this to get those other sites. The last site is a little trickier, so you'd need to try out Selenium:
# Answer: DAX via read_html; DOW/NASDAQ via the CME settlements JSON API.
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import date, timedelta

url_DAX = 'https://www.eurexchange.com/exchange-en/market-data/statistics/market-statistics-online/100!onlineStats?viewType=4&productGroupId=13394&productId=34642&cp=&month=&year=&busDate=20191114'
df = pd.read_html(url_DAX)[0]
df.to_csv('results_DAX.csv')
print(df)

# Two days back to land on a date with published settlements.
dt = date.today() - timedelta(days=2)
dateParam = dt.strftime('%m/%d/%Y')

def fetch_cme_settlements(url, cache_buster):
    """Fetch one CME settlements JSON endpoint and return it as a DataFrame.

    Extracted helper: the original repeated the identical payload/request
    block for each product.  `cache_buster` fills the endpoint's '_' param.
    """
    payload = {
        'tradeDate': dateParam,
        'strategy': 'DEFAULT',
        'pageSize': '500',
        '_': cache_buster}
    response = requests.get(url, params=payload).json()
    return pd.DataFrame(response['settlements'])

df = fetch_cme_settlements(
    'https://www.cmegroup.com/CmeWS/mvc/Settlements/Futures/Settlements/318/FUT',
    '1573920502874')
df.to_csv('results_DOW.csv')
print(df)

df = fetch_cme_settlements(
    'https://www.cmegroup.com/CmeWS/mvc/Settlements/Futures/Settlements/146/FUT',
    '1573920650587')
df.to_csv('results_NASDAQ.csv')
print(df)

Categories