Loop through different links on a website and scrape certain information - python

Good afternoon all, I'm hoping that somebody can help me with a problem relating to looping through multiple links on a website. Many thanks in anticipation of your help. I have the code below, which gets the info I need from the first link and creates the DataFrame I need to present it. But there are more than 600 other links on the website and I'm not sure how to go about scraping all of them.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Overview page listing every auction (not fetched by this script).
url = "https://auctions.royaltyexchange.com/auctions_overview/"

# Fetch and parse a single auction page.
html = urlopen("https://auctions.royaltyexchange.com/auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview")
soup = BeautifulSoup(html, 'lxml')
type(soup)

# Get the title
title = soup.title
title = soup.find('h1', class_='title -auction-page -dark').text.strip()
title
data = {'Name': ['Title', title]}
df_title = pd.DataFrame(data)

# Get the IRR figure shown on the page
irr = soup.find('span', attrs={'id': 'current-irr'}).text.strip()
irr
data = {'value': ['theoretical IRR', irr]}
df_irr = pd.DataFrame(data)

# Collect the non-empty cell texts of every row of the overview table.
table = soup.find('table', class_='es-overview-table')
res = []
for table_row in table.find_all('tr'):
    stripped_cells = (cell.text.strip() for cell in table_row.find_all('td'))
    row = [text for text in stripped_cells if text]
    if row:
        res.append(row)

# Transpose so that each table row becomes a column, then put the three
# frames side by side.
df_table = pd.DataFrame(pd.DataFrame(res).T)
df_final = pd.concat([df_title, df_irr, df_table], axis=1, ignore_index=True)
df_final.head()

First, you can use this to collect the links to all auctions from every overview page.
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup
raw_url = "https://auctions.royaltyexchange.com/"


def get_link(page_num):
    """Return the auction links found on overview pages 1 through page_num.

    Every matching href is printed as it is found and is stored with the
    module-level ``raw_url`` prepended to it.
    """
    auction_href = re.compile("^(/auctions/)")
    overview = "https://auctions.royaltyexchange.com/auctions_overview/?origin=overview&page="
    link_ls = []
    for page in range(1, page_num + 1):
        bs = BeautifulSoup(urlopen(overview + str(page)), 'html.parser')
        listing = bs.find('div', {'class': '-list'})
        for anchor in listing.find_all('a', href=auction_href):
            href = anchor.attrs['href']
            print(href)
            link_ls.append(raw_url + href)
    return link_ls


link_list = get_link(55)  # the last page number
link_list
['https://auctions.royaltyexchange.com//auctions/hip-hop-royalties-danileighs-lil-bebe/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/k-pop-publishing-featuring-exo-and-tvxq/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/film-royalties-classic-comedy-trading-places/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/ben-jerrys-cherry-garcia-trademark-royalties/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/the-doobie-brothers-black-water-more/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/dirty-dancings-ive-had-the-time-of-my-life/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/multi-platinum-hip-hop-collection/?origin=overview&filter_value=overview',
...
Then, for each of these links, extract the data you want (e.g. title, name, etc.) and put it into a DataFrame.

A slight refactor of @yganalyst's code and yours:
import pandas as pd
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
def get_link(page_num, raw_url):
    """Collect absolute auction URLs from the paginated overview listing.

    Args:
        page_num: Number of the last overview page to read; pages
            1..page_num are fetched in order.
        raw_url: Base URL of the site, e.g.
            "https://auctions.royaltyexchange.com/".

    Returns:
        A list of absolute auction URLs in the order they were found.
    """
    # Local import so the function does not depend on a file-level import.
    from urllib.parse import urljoin

    # Compiled once: the pattern is the same for every page.
    pobj = re.compile("^(/auctions/)")
    link_ls = []
    for page in range(1, page_num + 1):
        url = urljoin(raw_url, f"auctions_overview/?origin=overview&page={page}")
        # Close the HTTP response as soon as the page has been parsed.
        with urlopen(url) as html:
            bs = BeautifulSoup(html, 'html.parser')
        listing = bs.find('div', {'class': '-list'})
        if listing is None:
            # No listing container on this page, so there is nothing to collect.
            continue
        for link in listing.find_all('a', href=pobj):
            # The matched hrefs start with "/", so concatenating them to a
            # base that ends in "/" would give "...com//auctions/...";
            # urljoin resolves them against the base instead.
            link_ls.append(urljoin(raw_url, link['href']))
    return link_ls
def extract_auction(url2):
    """Scrape one auction page into a flat dict.

    Args:
        url2: URL of the auction page to fetch.

    Returns:
        A dict holding 'Title', 'theoretical IRR' and one entry per row of
        the overview table (first cell with colons removed -> second cell).

    Raises:
        AttributeError: If the title heading, the IRR span or the overview
            table is not present in the page.
    """
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(url2) as html:
        soup = BeautifulSoup(html, 'lxml')

    data = {
        'Title': soup.find('h1', class_='title -auction-page -dark').text.strip(),
        'theoretical IRR': soup.find('span', attrs={'id': 'current-irr'}).text.strip(),
    }

    table = soup.find('table', class_='es-overview-table')
    for tr in table.find_all('tr'):
        stripped = (td.text.strip() for td in tr.find_all('td'))
        cells = [text for text in stripped if text]
        if len(cells) < 2:
            # A label/value pair needs two non-empty cells; rows with fewer
            # are skipped rather than raising IndexError.
            continue
        data[cells[0].replace(':', '')] = cells[1]
    return data
base_url = "https://auctions.royaltyexchange.com/"
page_num = 1
link_list = get_link(page_num, base_url)

# Scrape every collected auction link, printing each URL before it is fetched.
data = []
for auction_url in link_list:
    print(auction_url)
    record = extract_auction(auction_url)
    data.append(record)

# Build the final frame from the list of per-auction dicts.
df_final = pd.DataFrame(data)

Related

Unable to print the names and links in Python

I got stuck extracting the names and links: the code gives no result for them, although it does print the prices.
link from where I scraping is: https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
url = 'https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/'
r = requests.get(url)
time.sleep(6)
soup = BeautifulSoup(r.content, 'html.parser')

# One container per product card.
content = soup.find_all('div', class_='col-md-12 pr-0 pl-0')
for product in content:
    # NOTE(review): this reads 'href' from a <div>; presumably the link
    # lives on an <a> tag instead - confirm against the page markup.
    link_div = product.find('div', {'class': 'col-md-12 d-table-cell align-middle'})
    links = link_div['href']
    # NOTE(review): the name is taken from the text of an <img> tag - verify
    # that this is where the product name is stored.
    image = product.find('img', class_='img-fluid')
    name = image.text.strip()
    price_div = product.find('div', class_='ProductPriceRating d-table-cell text-center pl-1 pr-1 align-middle')
    price = price_div.text.strip()
    print(name, links, price)
You can try like this
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import csv
url = 'https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/'
# requests.get only returns once the response has arrived, so no sleep is
# needed before parsing.
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')

# One container per product card.
content = soup.find_all('div', class_='col-md-12 pr-0 pl-0')
header = ['url', 'item', 'price']
data = []
for product in content:  # not named `property`, which would shadow the builtin
    anchors = product.find_all("a")
    price_div = product.find('div', {'class': "ProductPriceRating"})
    if len(anchors) < 2 or price_div is None:
        # Incomplete card: the link comes from the first anchor, the title
        # from the second, and a price block must exist.
        continue
    link = anchors[0].get('href')
    title = anchors[1].get_text(strip=True)
    price = price_div.get_text(strip=True)
    data.append([link, title, price])

print(data)
df = pd.DataFrame(data, columns=header)
df.to_csv("products.csv")

Iterate Over URLs Using BeautifulSoup

I have written some code to gather URLs for each race course from https://www.horseracing.net/racecards. I have also written some code to scrape data from each race course page.
Each bit of code works as it should but I am having trouble creating a for loop to loop through all the race course URLs.
Here's the code to scrape the course URLs:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"
reqs = requests.get(todays_racecard_url)
content = reqs.text
soup = BeautifulSoup(content, 'html.parser')

# Every <h3> that contains a link with an href contributes one course URL.
course_urls = []
for h in soup.find_all('h3'):
    # href=True only matches anchors that carry an href, so headings without
    # a usable link are skipped explicitly instead of through a bare except.
    a = h.find('a', href=True)
    if a is None:
        continue
    course_urls.append(urljoin(base_url, a['href']))

for card_url in course_urls:
    print(card_url)
And here's the code to scrape the pages:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
def _span_text(container, css_class):
    """Return the text of the first <span> with css_class, or '' if absent."""
    span = container.find('span', class_=css_class)
    return span.text if span is not None else ''


url = "https://www.horseracing.net/racecards/fontwell/13-05-21"
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")

# One entry per runner container; tips and tipsters default to ''.
runner = []
tips = []
tipsters = []
for container in soup.find_all('div', class_='row-cell-right'):
    runner.append(container.h5.a.text)
    tips.append(_span_text(container, 'tip-text number-tip'))
    tipsters.append(_span_text(container, 'pointers-text currency-text'))

newspaper_tips = pd.DataFrame({
    'Runners': runner,
    'Tips': tips,
    'Tipsters': tipsters,
})
# Remove the ' - ' separator from the tipster text.
newspaper_tips['Tipsters'] = newspaper_tips['Tipsters'].str.replace(' - ', '')
# Append mode without a header, so repeated runs add rows to the same file.
newspaper_tips.to_csv('NewspaperTips.csv', mode='a', header=False, index=False)
How do I join them to get the result I'm looking for?
It could be combined as follows:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"
req = requests.get(todays_racecard_url)
soup_racecard = BeautifulSoup(req.content, 'html.parser')

# Rows are collected in a plain list and turned into a DataFrame once at the
# end, instead of growing the DataFrame one row at a time.
rows = []
for h in soup_racecard.find_all('h3'):
    a = h.find('a', href=True)  # only find tags with href present
    if a is None:
        continue
    url = urljoin(base_url, a['href'])
    print(url)
    results = requests.get(url)
    soup_url = BeautifulSoup(results.text, "html.parser")
    for container in soup_url.find_all('div', class_='row-cell-right'):
        # Look each optional span up once; missing spans become ''.
        tip_span = container.find('span', class_='tip-text number-tip')
        tipster_span = container.find('span', class_='pointers-text currency-text')
        rows.append([
            container.h5.a.text,
            tip_span.text if tip_span is not None else '',
            tipster_span.text if tipster_span is not None else '',
        ])

df = pd.DataFrame(rows, columns=['Runners', 'Tips', 'Tipsters'])
df['Tipsters'] = df['Tipsters'].str.replace(' - ', '')
df.to_csv('NewspaperTips.csv', index=False)
Giving you a CSV starting:
Runners,Tips,Tipsters
Ajrad,2,NEWMARKET
Royal Tribute,1,The Times
Time Interval,1,Daily Mirror
Hemsworth,1,Daily Express
Ancient Times,,
Final Watch,,
Hala Joud,,
May Night,1,The Star
Tell'Em Nowt,,

Having trouble in scraping table data using beautiful soup

I would like to scrape the table data from this site. I've tried the code below but for whatever reason, BS4 seems unable to fetch the table data:
import bs4 as bs
import urllib.request
sauce = urllib.request.urlopen('https://drafty.cs.brown.edu/csprofessors').read()
soup = bs.BeautifulSoup(sauce, 'lxml')

# NOTE(review): if the fetched HTML has no <table id="table">, soup.find
# returns None and the find_all call below raises AttributeError.
table = soup.find('table', attrs={"id": "table"})

# Print the cell texts of every row, one list per row.
for table_row in table.find_all('tr'):
    cells = table_row.find_all('td')
    print([cell.text for cell in cells])
I would really appreciate your help :)
You used the wrong tag and id to find the table. The following should work:
import bs4 as bs
import urllib.request
sauce = urllib.request.urlopen('https://drafty.cs.brown.edu/csprofessors').read()
soup = bs.BeautifulSoup(sauce, 'lxml')

# The rows are looked up inside the <template id="table-data"> element.
table = soup.find('template', attrs={"id": "table-data"})
for table_row in table.find_all('tr'):
    cell_texts = []
    for cell in table_row.find_all('td'):
        cell_texts.append(cell.text)
    print(cell_texts)
import requests
from bs4 import BeautifulSoup as bs4
url = ('https://drafty.cs.brown.edu/csprofessors')
response = requests.get(url)
if response.ok:
    soup = bs4(response.text, 'html.parser')
    # Field names in column order: the first <td> of a row is the full name,
    # the second the university, and so on.
    fields = ('fullnames', 'university', 'join_year',
              'sub_field', 'bachelors', 'doctorate')
    # One dict per table row. Each record is built from the cells of a single
    # row so its values stay aligned, instead of appending the complete
    # column lists on every iteration.
    data = []
    for tr in soup.select('tr'):
        cells = tr.find_all('td', recursive=False)
        if len(cells) < len(fields):
            # Header rows or incomplete rows carry no full record.
            continue
        data.append({
            field: cell.get_text(strip=True)
            for field, cell in zip(fields, cells)
        })
You can simply use selenium combined with pandas to scrape the table. Here is how you do it:
import pandas as pd
from selenium import webdriver
import time
from io import StringIO

url = 'https://drafty.cs.brown.edu/csprofessors'
driver = webdriver.Chrome()
try:
    driver.get(url)
    time.sleep(2)
    # Dismiss the welcome screen. The attribute test in XPath is written
    # "@id" ("#id" is not valid XPath), and find_element("xpath", ...) is
    # used because the find_element_by_* helpers were removed in Selenium 4.
    driver.find_element('xpath', '//*[@id="welcome-screen"]/div/div/div[1]/button').click()
    time.sleep(1)
    page = driver.page_source
finally:
    # Always shut the browser down, even if a step above fails.
    driver.quit()

# read_html expects a path, URL or file-like object; wrap the HTML string.
df = pd.read_html(StringIO(page))[0]
print(df)

Data Scrape Output into Dataframe

Hello everyone, I have scraped this information from a job listing site so far. Everything seems to work well; however, I am struggling to get this information into a DataFrame with headers. Any help is appreciated.
My Full code is:
import requests
from bs4 import BeautifulSoup
import pandas as pd
def _mentions_test(text):
    """Return True when 'test' occurs in the lower-cased heading text."""
    return 'test' in text.lower()


URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')

# Headings whose text mentions "test", each with its apply link.
python_jobs = results.find_all('h2', string=_mentions_test)
for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print(f"Apply Here: {link}")

# Title, company and location of every job card that has all three.
job_elems = results.find_all('section', class_='card-content')
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if title_elem is None or company_elem is None or location_elem is None:
        continue
    for elem in (title_elem, company_elem, location_elem):
        print(elem.text.strip())
    print()
Not sure how to approach this.
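Use concat() to combine the columns of each job into one row, then append each row in the loop and combine them into one DataFrame. (Note: DataFrame.append() was removed in pandas 2.0; on newer versions, collect the rows in a list and call pd.concat() once.)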
import requests
from bs4 import BeautifulSoup
import pandas as pd
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')

# The filter receives None for headings without a single string, so guard
# against that before lower-casing.
python_jobs = results.find_all(
    'h2', string=lambda text: text is not None and 'test' in text.lower())
for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print(f"Apply Here: {link}")

job_elems = results.find_all('section', class_='card-content')
columns = ['title', 'company', 'location']
frames = []
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    # One single-row frame per job, with named columns as headers.
    df1 = pd.concat([pd.Series(title_elem.text.strip()),
                     pd.Series(company_elem.text.strip()),
                     pd.Series(location_elem.text.strip())], axis=1)
    df1.columns = columns
    frames.append(df1)

# DataFrame.append was removed in pandas 2.0, so the rows are combined with a
# single concat; ignore_index gives each job its own row label. concat raises
# on an empty list, hence the explicit empty frame.
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=columns)
print(df)
You can save the job details (i.e, title, company, and location) in a dictionary, then dataframe the dictionary.
import requests
from bs4 import BeautifulSoup
import pandas as pd
def _mentions_test(text):
    """Return True when 'test' occurs in the lower-cased heading text."""
    return 'test' in text.lower()


URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')

# Headings whose text mentions "test", each with its apply link.
python_jobs = results.find_all('h2', string=_mentions_test)
for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print(f"Apply Here: {link}")

job_elems = results.find_all('section', class_='card-content')

# Maps 'opening 1', 'opening 2', ... to the details of each complete job card.
my_job_list = {}
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if title_elem is None or company_elem is None or location_elem is None:
        continue
    position = title_elem.text.strip()
    company = company_elem.text.strip()
    location = location_elem.text.strip()
    # Openings are numbered in the order they are stored, starting at 1.
    op = f'opening {len(my_job_list) + 1}'
    my_job_list[op] = {'position': position, 'company': company, 'location': location}
    print(position)
    print(company)
    print(location)

df = pd.DataFrame(my_job_list)
print(df)

Get value between tags TD (python)

I want to get the values between td tags. I wrote the code below, but I think it can be improved and made more elegant. Please advise.
from bs4 import BeautifulSoup
import requests
invite_date = str()
url = 'http://reestr.nostroy.ru/reestr/clients/233/members/5801625'
html = requests.get(url)
soup = BeautifulSoup(html.content, 'html.parser')
news = soup.find('table', class_='items table')

# Only rows that contain a <td> are counted; the row at position 6 among
# them (0-based) supplies the value, and its last cell wins.
rows_with_cells = (tr for tr in news.find_all('tr') if tr.find('td'))
for position, table_row in enumerate(rows_with_cells):
    if position == 6:
        for cell in table_row.find_all('td'):
            invite_date = cell.text
print(invite_date)
To get only invite_date, you can try this:
from bs4 import BeautifulSoup
import requests
invite_date = str()
url = 'http://reestr.nostroy.ru/reestr/clients/233/members/5801625'
html = requests.get(url)
soup = BeautifulSoup(html.content, 'html.parser')
news = soup.find('table', class_='items table')

# Take the text of the first <td> in the table row at index 7.
all_rows = news.find_all('tr')
target_row = all_rows[7]
invite_date = target_row.td.text
print(invite_date)
Output will be:
21.05.2019

Categories