Trouble with my output when webscraping website - python

I would like to scrape the names of all the companies from each of the links here:
https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html
Each of those links lists several companies, for example:
https://www.bilansgratuits.fr/classement/6420Z/default.html
My goal is to get all of those companies for every link.
Here's my script so far:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
def clean_text(text):
    text = tokenizer.tokenize(text)
    final_text = ' '.join(text)
    return final_text
url = 'https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html'
links = []
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")
links = [a['href'] for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]
names = []
root_url = 'https://www.bilansgratuits.fr/'
urls = [ '{root}{i}'.format(root=root_url, i=i) for i in links ]
for url in urls[:3]:
    results = requests.get(url)
    soup = BeautifulSoup(results.text, "html.parser")
    try:
        name = [a.text for a in soup.find("div", {"class": "donnees"}).find_all('a', href=True)]
    except:
        name = [a.text for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]
    names.append(name)
for i in range(0, 3):
    rx = re.compile(r'^\s+$')
    names[i] = [item.split() for item in names[i] if not rx.match(item)]
data = pd.DataFrame({
'names' : names
})
data['names']= data['names'].apply(str)
data['names']= data['names'].apply(lambda x : clean_text(x))
print(data)
#data.to_csv('dftest.csv', sep=';', index=False, encoding = 'utf_8_sig')
I get this output (screenshot omitted): each row holds every company name scraped from one link.
But that's not what I want; I would like each row to hold the name of a single company, like this (screenshot omitted).
And so on for all the names.
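Roughly, the frame I'm after would look like this (an illustrative sketch only, with invented placeholder names standing in for the scraped ones):

import pandas as pd

# Desired shape: one company name per row (the values below are placeholders).
wanted = pd.DataFrame({'names': ['COMPANY A', 'COMPANY B', 'COMPANY C']})
print(wanted)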

Is this what you want?
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = "https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html"
html = requests.get(url).text
follow_urls = [
    f"https://www.bilansgratuits.fr{anchor['href']}" for anchor
    in BeautifulSoup(html, "html.parser").select(".titreElementAnnuaire a")
]
data = []
for follow_url in follow_urls:
    print(f"Fetching: {follow_url}")
    css_selector = ".titreElementAnnuaire a" if "6411Z" in follow_url else ".classementTop .blocRaisonSociale > a"
    company_urls = BeautifulSoup(
        requests.get(follow_url).text,
        "html.parser",
    ).select(css_selector)
    data.extend(
        [
            [
                " ".join(anchor.getText(strip=True).split()),
                f"https://www.bilansgratuits.fr{anchor['href']}",
            ] for anchor in company_urls
        ]
    )
pd.DataFrame(data).to_csv("your_data.csv", index=False, header=["Company", "URL"])
print("Done!")
Output: 345 entries in a .csv file (screenshot omitted).
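If you want to sanity-check the file from Python rather than opening it by hand, a quick read-back of the CSV written above would be:

import pandas as pd

df = pd.read_csv("your_data.csv")
print(len(df))    # should report the 345 rows mentioned above
print(df.head())  # first few Company / URL pairs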

Here's my final answer!
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import re
import itertools
url = 'https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html'
links = []
results = requests.get(url)
#time.sleep(20)
soup = BeautifulSoup(results.text, "html.parser")
links = [a['href'] for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]
secteur = [a.text for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]
secteurs = []
URLS = []
names = []
root_url = 'https://www.bilansgratuits.fr/'
urls = [ '{root}{i}'.format(root=root_url, i=i) for i in links ]
# Note: the loop variable reuses the name "secteur"; this works because zip()
# evaluates the slice secteur[:3] once, before the name is rebound.
for url, secteur in zip(urls[:3], secteur[:3]):
    results = requests.get(url)
    soup = BeautifulSoup(results.text, "html.parser")
    try:
        name = [a.text for a in soup.find("div", {"class": "donnees"}).find_all('a', href=True)]
        for i in name:
            URLS.append(url)
        for i in name:
            secteurs.append(secteur)
    except:
        name = [a.text for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]
        for i in name:
            URLS.append(url)
        for i in name:
            secteurs.append(secteur)
    names.append(name)
for i in range(0, 3):
    rx = re.compile(r'^\s+$')
    names[i] = [item.split() for item in names[i] if not rx.match(item)]
res = []
for sublist in names:
    for words in sublist:
        res.append(' '.join(words))
data = pd.DataFrame({
'names' : res,
'URL' : URLS,
'Secteur' : secteurs
})
data.to_csv('dftest.csv', sep=';', index=False, encoding = 'utf_8_sig')
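Since itertools is already imported above but never used, the nested flattening loop could equally be written with chain.from_iterable; a behaviour-equivalent sketch:

from itertools import chain

# names is a list (one entry per link) of lists of word-lists;
# chain.from_iterable flattens it one level, then each word-list is re-joined.
res = [' '.join(words) for words in chain.from_iterable(names)]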

Related

Webscraping - html issue

I've been trying this code and getting some success, but I cannot figure out the next step.
import pandas as pd
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pprint
import json
url2 = "https://www.procyclingstats.com/rankings.php"
print(colored('#Step1','green'))
response = requests.get(url2)
soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table', {'class':'basic'})
headers = [heading.text for heading in table.find_all('th',{"class":"cu600"})]
#print(headers)
#why do I only get two headers here (prev, team)?
response = requests.get(url2)
dfs = pd.read_html(response.text)[0]
#print(list(dfs))
#with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
# print(dfs)
print(colored('#Step1','red'))
print(colored('#Step2','green'))
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, 'lxml')
table2 = soup.find({'class':'class="mt10 pps"'})
#headers = [heading.text for heading in table1.find_all('th',{"class":"cu600"})]
#print(headers)
# Usually the line below is enough
# But for some reason returning Forbidden
#dfs = pd.read_html(url)[0]
response = requests.get(url3)
dfs2 = pd.read_html(response.text)[0]
#with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
#print(dfs2)
child_soup = soup.find('h3')
for i in child_soup.children:
    print("child : ", i)
print('\n'*3)
I end up with the child as Rider (result text below)
#Step2
child : Rider
What I'm trying to capture is the 'points per speciality' and the corresponding values.
There is also a second question: why do I only get two header tags when they all appear to have the same class name?
Photo and arrow showing desired result
import re
url2 = "https://www.procyclingstats.com/rankings.php"
response = requests.get(url2)
soup = BeautifulSoup(response.text, "lxml")
table = soup.find("table", {"class": "basic"})
thead = table.find("thead")
headers = [heading.text for heading in thead.find_all("th")]
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, "lxml")
ul = soup.find("ul", {"class": "basic"})
li = ul.find_all("li")
d = {}
for l in li:
    m = re.search(r"(\d+)(.*)", l.text)
    d[m.group(2)] = m.group(1)
print(headers)
print(d)
# ['#', 'Prev.', 'Diff.', 'Rider', 'Team', 'Points']
# {'One day races': '1641', 'GC': '3444', 'Time trial': '1147', 'Sprint': '302', 'Climber': '3816'}
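If those speciality points are also wanted as a table rather than a dict, one possible follow-up (assuming the d built in the snippet above) is a one-row DataFrame:

import pandas as pd

df_points = pd.DataFrame([d])  # one row, one column per speciality (values are still strings)
print(df_points)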

Unable to print the names and links in Python

I got stuck on extracting the names and links: that part doesn't return anything, but it does print the prices.
The link I am scraping from is: https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
url = 'https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/'
r = requests.get(url)
time.sleep(6)
soup = BeautifulSoup(r.content, 'html.parser')
content = soup.find_all('div', class_ = 'col-md-12 pr-0 pl-0')
for property in content:
    links = property.find('div', {'class': 'col-md-12 d-table-cell align-middle'})['href']
    name = property.find('img', class_='img-fluid').text.strip()
    price = property.find('div', class_='ProductPriceRating d-table-cell text-center pl-1 pr-1 align-middle').text.strip()
    print(name, links, price)
You can try it like this:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import csv
url = 'https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/'
r = requests.get(url)
# time.sleep(6)
soup = BeautifulSoup(r.content, 'html.parser')
content = soup.find_all('div', class_ = 'col-md-12 pr-0 pl-0')
# print(content)
header = ['url', 'item', 'price']
data = []
for property in content:
    link = [i['href'] for i in property.findAll("a")][0]
    title = [i.getText(strip=True) for i in property.find_all("a")][1]
    price = [i.getText(strip=True) for i in property.find_all('div', {'class': "ProductPriceRating"})][0]
    data.append([link, title, price])
print(data)
df = pd.DataFrame(data, columns=header)
df.to_csv("products.csv")

Iterate Over URLs Using BeautifulSoup

I have written some code to gather URLs for each race course from https://www.horseracing.net/racecards. I have also written some code to scrape data from each race course page.
Each bit of code works as it should but I am having trouble creating a for loop to loop through all the race course URLs.
Here's the code to scrape the course URLs:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"
reqs = requests.get(todays_racecard_url)
content = reqs.text
soup = BeautifulSoup(content, 'html.parser')
course_urls = []
for h in soup.findAll('h3'):
    a = h.find('a')
    try:
        if 'href' in a.attrs:
            card_url = urljoin(base_url, a.get('href'))
            course_urls.append(card_url)
    except:
        pass
for card_url in course_urls:
    print(card_url)
And here's the code to scrape the pages:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
url = "https://www.horseracing.net/racecards/fontwell/13-05-21"
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")
date = []
course = []
time = []
runner = []
tips = []
tipsters = []
runner_div = soup.find_all('div', class_='row-cell-right')
for container in runner_div:
    runner_name = container.h5.a.text
    runner.append(runner_name)
    tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
    tips.append(tips_no)
    tipster_names = container.find('span', class_='pointers-text currency-text').text if container.find('span', class_='pointers-text currency-text') else ''
    tipsters.append(tipster_names)
newspaper_tips = pd.DataFrame({
'Runners': runner,
'Tips': tips,
'Tipsters': tipsters,
})
newspaper_tips['Tipsters'] = newspaper_tips['Tipsters'].str.replace(' - ', '')
newspaper_tips.to_csv('NewspaperTips.csv', mode='a', header=False, index=False)
How do I join them to get the result I'm looking for?
It could be combined as follows:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"
req = requests.get(todays_racecard_url)
soup_racecard = BeautifulSoup(req.content, 'html.parser')
df = pd.DataFrame(columns=['Runners', 'Tips', 'Tipsters'])
for h in soup_racecard.find_all('h3'):
    a = h.find('a', href=True)  # only find tags with href present
    if a:
        url = urljoin(base_url, a['href'])
        print(url)
        results = requests.get(url)
        soup_url = BeautifulSoup(results.text, "html.parser")
        for container in soup_url.find_all('div', class_='row-cell-right'):
            runner_name = container.h5.a.text
            tips_no = container.find('span', class_='tip-text number-tip').text if container.find('span', class_='tip-text number-tip') else ''
            tipster_names = container.find('span', class_='pointers-text currency-text').text if container.find('span', class_='pointers-text currency-text') else ''
            row = [runner_name, tips_no, tipster_names]
            df.loc[len(df)] = row  # append the new row
df['Tipsters'] = df['Tipsters'].str.replace(' - ', '')
df.to_csv('NewspaperTips.csv', index=False)
Giving you a CSV starting:
Runners,Tips,Tipsters
Ajrad,2,NEWMARKET
Royal Tribute,1,The Times
Time Interval,1,Daily Mirror
Hemsworth,1,Daily Express
Ancient Times,,
Final Watch,,
Hala Joud,,
May Night,1,The Star
Tell'Em Nowt,,
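One design note on the combined script: growing the frame with df.loc[len(df)] = row works, but appending to a DataFrame one row at a time is generally slow on large race cards. A sketch of the same extraction that collects plain lists first and builds the frame once (using the example Fontwell card from the question):

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://www.horseracing.net/racecards/fontwell/13-05-21"
soup_url = BeautifulSoup(requests.get(url).text, "html.parser")

rows = []
for container in soup_url.find_all('div', class_='row-cell-right'):
    runner_name = container.h5.a.text
    tip = container.find('span', class_='tip-text number-tip')
    tipster = container.find('span', class_='pointers-text currency-text')
    rows.append([runner_name, tip.text if tip else '', tipster.text if tipster else ''])

# Built in one go instead of row-by-row appends.
df = pd.DataFrame(rows, columns=['Runners', 'Tips', 'Tipsters'])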

Data Scrape Output into Dataframe

Hello everyone, I have scraped this information from a job-listing site so far. Everything seems to work well; however, I am struggling to get this information into a data frame with headers and everything. Any help is appreciated.
My Full code is:
import requests
from bs4 import BeautifulSoup
import pandas as pd
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')
python_jobs = results.find_all('h2',string=lambda text: 'test' in text.lower())
for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print(f"Apply Here: {link}")
job_elems = results.find_all('section', class_= 'card-content')
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    print(title_elem.text.strip())
    print(company_elem.text.strip())
    print(location_elem.text.strip())
    print()
Not sure how to approach this.
Use concat() for all the columns, then append() each result to one dataframe in the loop:
import requests
from bs4 import BeautifulSoup
import pandas as pd
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')
python_jobs = results.find_all('h2',string=lambda text: 'test' in text.lower())
for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print(f"Apply Here: {link}")
job_elems = results.find_all('section', class_= 'card-content')
df = pd.DataFrame()
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    df1 = pd.concat([pd.Series(title_elem.text.strip()),
                     pd.Series(company_elem.text.strip()),
                     pd.Series(location_elem.text.strip())], axis=1)
    df = df.append(df1)
print(df)
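One caveat if you are on a recent pandas: DataFrame.append was deprecated in 1.4 and removed in 2.0, so the loop above will raise an AttributeError there. The same idea still works by collecting the per-job frames in a list and concatenating once at the end; a minimal sketch with placeholder rows standing in for the scraped values:

import pandas as pd

frames = []
for job in [("Developer", "Acme", "Sydney"), ("Tester", "Globex", "Melbourne")]:  # placeholder data
    frames.append(pd.DataFrame([job], columns=["title", "company", "location"]))

df = pd.concat(frames, ignore_index=True)  # one concat instead of repeated df.append
print(df)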
You can save the job details (i.e., title, company, and location) in a dictionary, then build a DataFrame from that dictionary.
import requests
from bs4 import BeautifulSoup
import pandas as pd
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='ResultsContainer')
python_jobs = results.find_all('h2',string=lambda text: 'test' in text.lower())
for p_job in python_jobs:
    link = p_job.find('a')['href']
    print(p_job.text.strip())
    print(f"Apply Here: {link}")
job_elems = results.find_all('section', class_= 'card-content')
i = 1
my_job_list = {}
for job_elem in job_elems:
    title_elem = job_elem.find('h2', class_='title')
    company_elem = job_elem.find('div', class_='company')
    location_elem = job_elem.find('div', class_='location')
    if None in (title_elem, company_elem, location_elem):
        continue
    op = f'opening {i}'
    my_job_list[op] = {'position': title_elem.text.strip(),
                       'company': company_elem.text.strip(),
                       'location': location_elem.text.strip()}
    i = i + 1
    print(title_elem.text.strip())
    print(company_elem.text.strip())
    print(location_elem.text.strip())
df = pd.DataFrame(my_job_list)
print(df)
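As written, pd.DataFrame(my_job_list) turns each 'opening N' key into a column and the position/company/location keys into the index. If one row per job opening is preferred, building the frame from the same dictionary with orient='index' (or transposing) does that:

df = pd.DataFrame.from_dict(my_job_list, orient='index')  # one row per opening
# or equivalently: df = pd.DataFrame(my_job_list).T
print(df)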

Loop through different links on a website and scrape certain information

Good afternoon all, I'm hoping that somebody may help me with a problem relating to looping through multiple links on a website. Many thanks in anticipation of your help. I have the code below, which gets the info I need from the first link and creates the DataFrame I need to present it. But there are more than 600 other links on the website and I'm not sure how to go about them.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "https://auctions.royaltyexchange.com/auctions_overview/"
html = urlopen("https://auctions.royaltyexchange.com/auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview")
soup = BeautifulSoup(html, 'lxml')
type(soup)
# Get the title
title = soup.title
title = soup.find('h1', class_='title -auction-page -dark').text.strip()
title
data = {'Name':['Title',title]}
df_title = pd.DataFrame(data)
irr = soup.find('span',attrs={'id':'current-irr'}).text.strip()
irr
data = {'value' : ['theoretical IRR',irr]}
df_irr = pd.DataFrame(data)
table = soup.find('table', class_='es-overview-table')
table_rows = table.find_all('tr')
res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)
df_table = pd.DataFrame(pd.DataFrame(res).transpose())
df_final = pd.concat([df_title,df_irr ,df_table], axis=1, ignore_index = True)
df_final.head()
You can use this to get all the auction links from every listing page first.
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup
raw_url = "https://auctions.royaltyexchange.com/"
def get_link(page_num):
    global raw_url
    link_ls = []
    for page in range(1, page_num + 1):
        url = "https://auctions.royaltyexchange.com/auctions_overview/?origin=overview&page=" + str(page)
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        for link in bs.find('div', {'class': '-list'}).findAll('a', href=re.compile("^(/auctions/)")):
            print(link.attrs['href'])
            link_ls.append(raw_url + link.attrs['href'])
    return link_ls
link_list = get_link(55) # the last page number
link_list
['https://auctions.royaltyexchange.com//auctions/hip-hop-royalties-danileighs-lil-bebe/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/k-pop-publishing-featuring-exo-and-tvxq/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/jay-zs-multi-platinum-empire-state-of-mind/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/film-royalties-classic-comedy-trading-places/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/ben-jerrys-cherry-garcia-trademark-royalties/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/the-doobie-brothers-black-water-more/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/dirty-dancings-ive-had-the-time-of-my-life/?origin=overview&filter_value=overview',
'https://auctions.royaltyexchange.com//auctions/multi-platinum-hip-hop-collection/?origin=overview&filter_value=overview',
...
On each page, pick out the data you want to extract (e.g. title, name, etc.) and build the dataframe you need from it.
A slight refactor of yganalyst's answer above combined with your code:
import pandas as pd
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup
def get_link(page_num, raw_url):
    link_ls = []
    for page in range(1, page_num + 1):
        url = raw_url + "auctions_overview/?origin=overview&page=" + str(page)
        html = urlopen(url)
        bs = BeautifulSoup(html, 'html.parser')
        pobj = re.compile("^(/auctions/)")
        for link in bs.find('div', {'class': '-list'}).findAll('a', href=pobj):
            link_ls.append(raw_url + link.attrs['href'])
    return link_ls

def extract_auction(url2):
    data = {}
    html = urlopen(url2)
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find('h1', class_='title -auction-page -dark').text.strip()
    data['Title'] = title
    irr = soup.find('span', attrs={'id': 'current-irr'}).text.strip()
    data['theoretical IRR'] = irr
    table = soup.find('table', class_='es-overview-table')
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [tr.text.strip() for tr in td if tr.text.strip()]
        if row:
            key = row[0].replace(':', '')
            data[key] = row[1]
    return data
base_url = "https://auctions.royaltyexchange.com/"
page_num = 1
link_list = get_link(page_num, base_url)
data = []
for ll in link_list:
    print(ll)
    data.append(extract_auction(ll))
df_final = pd.DataFrame(data)
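From there, persisting the result and scaling up the crawl are small steps; a sketch (the page count of 55 comes from the earlier answer, and the pause between requests is only a courtesy suggestion, not something the site documents):

df_final.to_csv('auctions.csv', index=False)

# To crawl every listing page rather than just the first:
# link_list = get_link(55, base_url)   # 55 was the last page number in the answer above
# Consider adding time.sleep(1) inside the loop to avoid hammering the server.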
