Here is my current code to scrape specific player data from a site:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
from pandas import ExcelWriter
import lxml
import xlsxwriter
# Scrape the first page of the Futbin player list and export it to Excel.
page = requests.get('https://www.futbin.com/players?page=1')
# Fail loudly on an HTTP error instead of parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'lxml')

# Every selector below is scoped to the element with id 'repTb'.
pool = soup.find(id='repTb')
if pool is None:
    raise RuntimeError("Element with id 'repTb' not found on the page")

pnames = pool.find_all(class_='player_name_players_table')
pprice = pool.find_all(class_='ps4_color font-weight-bold')
prating = pool.select('span[class*="form rating ut20"]')

all_player_names = [name.getText() for name in pnames]
all_prices = [price.getText() for price in pprice]
all_pratings = [rating.getText() for rating in prating]

# NOTE: pandas requires the three lists to have the same length.
fut_data = pd.DataFrame(
    {
        'Player': all_player_names,
        'Rating': all_pratings,
        'Price': all_prices,
    }
)

# ExcelWriter.save() is gone in pandas 2.0; the context manager writes and
# closes the workbook on exit and works on older versions as well.
with pd.ExcelWriter('file.xlsx', engine='xlsxwriter') as writer:
    fut_data.to_excel(writer, sheet_name='Futbin')

print(fut_data)
This is working fine for the first page, but I need to go through 609 pages in total and get the data from all of them.
Could you please help me rewrite this code to make that work? I am still new and learning with this project.
You can iterate over all 609 pages, parse each page and at the end save collected data to file.xlsx:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Collect name, rating and price from every page of the Futbin player list
# and write the combined table to file.xlsx.
LAST_PAGE = 609

all_player_names = []
all_pratings = []
all_prices = []

# One Session is reused for all requests instead of opening a new
# connection per page.
with requests.Session() as session:
    for i in range(1, LAST_PAGE + 1):
        page = session.get('https://www.futbin.com/players?page={}'.format(i))
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'lxml')

        pool = soup.find(id='repTb')
        if pool is None:
            # Without this check the next line would fail with an opaque
            # AttributeError on None.
            raise RuntimeError(
                "Element with id 'repTb' not found on page {}".format(i))

        pnames = pool.find_all(class_='player_name_players_table')
        pprice = pool.find_all(class_='ps4_color font-weight-bold')
        prating = pool.select('span[class*="form rating ut20"]')

        all_player_names.extend(name.getText() for name in pnames)
        all_prices.extend(price.getText() for price in pprice)
        all_pratings.extend(rating.getText() for rating in prating)

# NOTE: pandas requires the three lists to have the same length.
fut_data = pd.DataFrame({'Player': all_player_names,
                         'Rating': all_pratings,
                         'Price': all_prices})

# ExcelWriter.save() is gone in pandas 2.0; the context manager writes and
# closes the workbook on exit and works on older versions as well.
with pd.ExcelWriter('file.xlsx', engine='xlsxwriter') as writer:
    fut_data.to_excel(writer, sheet_name='Futbin')
Related
I am unsure how to add the column data to the empty dataframe.
Here is my code.
!pip install bs4
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Re-imports kept from the original cell; both names are already in scope.
from bs4 import BeautifulSoup
import pandas as pd

# Download the Wikipedia "List of largest banks" page and parse it.
url = 'https://en.wikipedia.org/wiki/List_of_largest_banks?utm_medium=Exinfluencer&utm_source=Exinfluencer&utm_content=000026UJ&utm_term=10006555&utm_id=NA-SkillsNetwork-Channel-SkillsNetworkCoursesIBMDeveloperSkillsNetworkPY0221ENSkillsNetwork23455645-2021-01-01'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')

# Empty frame holding only the two target column labels.
data = pd.DataFrame(columns=["Name", "Market Cap (US$ Billion)"])

# Walk the rows of the fourth tbody element on the page.
# NOTE: each row's cells are collected into `col`, but nothing is ever
# added to `data`, so the frame is still empty after the loop.
table_rows = soup.find_all('tbody')[3].find_all('tr')
for row in table_rows:
    col = row.find_all('td')
When I use this code
data.head(5)
I retrieve an empty Dataframe.
This took a lot of digging and troubleshooting... but I keep forgetting that it's always the case with web scraping.
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Build a two-column table (bank name, market cap) from the Wikipedia
# "List of largest banks" page.
url = 'https://en.wikipedia.org/wiki/List_of_largest_banks?utm_medium=Exinfluencer&utm_source=Exinfluencer&utm_content=000026UJ&utm_term=10006555&utm_id=NA-SkillsNetwork-Channel-SkillsNetworkCoursesIBMDeveloperSkillsNetworkPY0221ENSkillsNetwork23455645-2021-01-01'
r = requests.get(url)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')

data = {
    'Name': [],
    'Market Cap (US$ Billion)': [],
}

# Rows of the fourth tbody element; the first row is skipped, as in the
# original loop that started at index 1 (presumably the header row).
rows = soup.find_all('tbody')[3].find_all('tr')
for row in rows[1:]:
    cells = row.find_all('td')
    # The name is the text of the second link in the second cell.
    # get_text() replaces slicing of str(tag), which breaks as soon as the
    # tag carries attributes or nested markup.
    data['Name'].append(cells[1].find_all('a')[1].get_text(strip=True))
    data['Market Cap (US$ Billion)'].append(cells[2].get_text(strip=True))

df = pd.DataFrame(data)
df.head()
I have written some code to gather URLs for each race course from https://www.horseracing.net/racecards. I have also written some code to scrape data from each race course page.
Each bit of code works as it should but I am having trouble creating a for loop to loop through all the race course URLs.
Here's the code to scrape the course URLs:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Collect the absolute URL of every race-course card linked from today's
# racecards page.
todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"

reqs = requests.get(todays_racecard_url)
content = reqs.text
soup = BeautifulSoup(content, 'html.parser')

course_urls = []
for h in soup.find_all('h3'):
    # href=True matches only anchors that carry an href. This replaces the
    # bare `except: pass`, which also hid every unrelated error.
    a = h.find('a', href=True)
    if a is None:
        continue
    course_urls.append(urljoin(base_url, a['href']))

for card_url in course_urls:
    print(card_url)
And here's the code to scrape the pages:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
# Scrape runner names, tip counts and tipster names from one race-course
# page and append them to NewspaperTips.csv.
url = "https://www.horseracing.net/racecards/fontwell/13-05-21"
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")

# Declared by the script but not filled in this snippet.
date, course, time = [], [], []
# One entry per runner container found on the page.
runner, tips, tipsters = [], [], []

for container in soup.find_all('div', class_='row-cell-right'):
    runner.append(container.h5.a.text)

    # Either span may be absent; store an empty string in that case.
    tip_span = container.find('span', class_='tip-text number-tip')
    tips.append(tip_span.text if tip_span else '')

    tipster_span = container.find('span', class_='pointers-text currency-text')
    tipsters.append(tipster_span.text if tipster_span else '')

newspaper_tips = pd.DataFrame({'Runners': runner, 'Tips': tips, 'Tipsters': tipsters})
newspaper_tips['Tipsters'] = newspaper_tips['Tipsters'].str.replace(' - ', '')
# Append mode without a header: rows are added to any existing file.
newspaper_tips.to_csv('NewspaperTips.csv', mode='a', header=False, index=False)
How do I join them to get the result I'm looking for?
It could be combined as follows:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Follow every course link on today's racecards page, scrape the runners
# on each course page and save everything to NewspaperTips.csv.
todays_racecard_url = 'https://www.horseracing.net/racecards'
base_url = "https://www.horseracing.net"

# Rows are collected in a plain list and turned into a DataFrame once at
# the end; growing a DataFrame with df.loc[len(df)] = row is much slower.
rows = []

# One Session is reused for the index page and every course page.
with requests.Session() as session:
    req = session.get(todays_racecard_url)
    soup_racecard = BeautifulSoup(req.content, 'html.parser')

    for h in soup_racecard.find_all('h3'):
        a = h.find('a', href=True)  # only find tags with href present
        if a is None:
            continue

        url = urljoin(base_url, a['href'])
        print(url)
        results = session.get(url)
        soup_url = BeautifulSoup(results.text, "html.parser")

        for container in soup_url.find_all('div', class_='row-cell-right'):
            runner_name = container.h5.a.text

            # Either span may be absent; store an empty string in that case.
            tip_span = container.find('span', class_='tip-text number-tip')
            tips_no = tip_span.text if tip_span else ''

            tipster_span = container.find('span', class_='pointers-text currency-text')
            tipster_names = tipster_span.text if tipster_span else ''

            rows.append([runner_name, tips_no, tipster_names])

df = pd.DataFrame(rows, columns=['Runners', 'Tips', 'Tipsters'])
# regex=False states that ' - ' is a literal, whatever the pandas default.
df['Tipsters'] = df['Tipsters'].str.replace(' - ', '', regex=False)
df.to_csv('NewspaperTips.csv', index=False)
Giving you a CSV starting:
Runners,Tips,Tipsters
Ajrad,2,NEWMARKET
Royal Tribute,1,The Times
Time Interval,1,Daily Mirror
Hemsworth,1,Daily Express
Ancient Times,,
Final Watch,,
Hala Joud,,
May Night,1,The Star
Tell'Em Nowt,,
I have been trying to download data from different urls and then save it to a csv file.
The idea is extract the highlighted data from: https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
So far I built the following piece of code:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
# Extract the "Cash Dividends Paid - Total" row from one MarketWatch
# cash-flow page and print it as a single-row table.
url_is = 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow'

# The context manager closes the HTTP response once the body is read.
with ur.urlopen(url_is) as response:
    read_data = response.read()
soup_is = BeautifulSoup(read_data, 'lxml')

# NOTE(review): newer soupsieve releases reportedly prefer
# ':-soup-contains' over ':contains' - confirm before changing the selector.
row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
if row is None:
    raise RuntimeError('Row "Cash Dividends Paid - Total" not found on the page')

# Keep the non-empty cells of the table row that contains the title cell.
data = [cell.text for cell in row.parent.select('td') if cell.text != '']
df = pd.DataFrame(data)
print(df.T)
I get as an output:
All good so far.
Now my idea is to extract specific classes from multiple URLs, keep the same headers from the website and export it to a .csv.
The tags and classes stay the same
Sample URLs:
https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow
Code (I wanted to try with 2 columns: 2015 and 2016)
As the desired output, I would like something like:
I wrote the following code, but it is giving me issues; any help or advice is welcome:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
import numpy as np
import requests
# Attempt to collect two values per MarketWatch cash-flow page into `container`.
# NOTE(review): indentation was lost in this paste, so the nesting of the
# statements below the two `for` lines is an assumption - confirm.
# Pages to scrape.
links = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow', 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
# Output frame with the two target columns.
container = pd.DataFrame(columns=['Name', 'Name2'])
# Row label used for the next write into `container`.
pos=0
for l in links:
read_data = ur.urlopen(l).read()
soup_is=BeautifulSoup(read_data, 'lxml')
# Title cell of the "Cash Dividends Paid - Total" row.
row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
# `results` holds the text of each non-empty cell (cell.text), not tag objects.
results=[cell.text for cell in row.parent.select('td') if cell.text!='']
records = []
for result in results:
# `records` is reset here, so it only ever holds the current pair of values.
records = []
# NOTE(review): `result` comes from cell.text, so `.find` here is presumably
# str.find, which takes no `attrs` keyword - likely the reported issue; confirm.
Name = result.find('span', attrs={'itemprop':'2015'}).text if result.find('span', attrs={'itemprop':'2015'}) is not None else ''
Name2 = result.find('span', attrs={'itemprop':'2016'}).text if result.find('span', attrs={'itemprop':'2016'}) is not None else ''
records.append(Name)
records.append(Name2)
# Store the pair as row `pos`, then advance to the next row label.
container.loc[pos] = records
pos+=1
import requests
import pandas as pd
urls = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow',
        'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']


def main(urls):
    """Print and return the "Cash Dividends Paid - Total" row of each page.

    For every URL the page is downloaded, the first table matching
    "Cash Dividends Paid - Total" is parsed, and its first row and first
    three columns are kept. The per-page rows are concatenated.

    Args:
        urls: iterable of page URLs to read.

    Returns:
        The concatenated DataFrame (empty when `urls` is empty).

    Raises:
        requests.HTTPError: if a page answers with an error status.
        ValueError: if no matching table is found on a page (raised by
            pandas.read_html).
    """
    # Local import so the snippet does not depend on a file-level import.
    from io import StringIO

    goal = []
    with requests.Session() as req:
        for url in urls:
            r = req.get(url)
            r.raise_for_status()
            # read_html gets a file-like object rather than raw page bytes.
            tables = pd.read_html(
                StringIO(r.text), match="Cash Dividends Paid - Total")
            goal.append(tables[0].iloc[[0], 0:3])

    # pd.concat raises on an empty list, so handle "no URLs" explicitly.
    new = pd.concat(goal) if goal else pd.DataFrame()
    print(new)
    return new


main(urls)
I'm trying to scrape this link for just two simple pieces of information, but I don't know why I get the result below, which doesn't contain all the data I'm looking for:
particulier_allinfo particulier_tel 0 ABEL KEVIN10 RUE VIRGILE67200 Strasbourg
This is the code, thanks for your help :
import bs4 as bs
import urllib
import urllib.request
import requests
from bs4 import BeautifulSoup
import pandas
from pandas import DataFrame
import csv
# Scrape names and phone links from page 1 of a 118000.fr search and print them.
# NOTE(review): indentation was lost in this paste, so the extent of the
# `with` block and of the two `for` loops below is an assumption - confirm.
# Open the output CSV and write only its header row; no writerow call is
# visible in this snippet.
with open('test_bs_118000.csv', mode='w') as csv_file:
fieldnames = ['AllInfo', 'Tel']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
# These two lists are created but never appended to in this snippet.
particulier_allinfo = []
particulier_tel = []
# Page number used to build the search URL (fixed at 1 here).
i=1
historyurl= "https://www.118000.fr/search?part=1&who=kevin&page=" + str(i)
historypage= urllib.request.urlopen(historyurl)
soup=bs.BeautifulSoup(historypage,'html.parser')
# Print a running counter and the text of every matching phone link.
cat=1
for category in soup.findAll('a',{'class':'clickable atel'}):
print(cat)
print(category.text)
cat=cat+1
# Print a running counter and the text of every 'cardbanner' div.
q=1
for freru in soup.findAll('div',{'class':'cardbanner'}):
print(q)
print(freru.text)
q=q+1
#creating the data frame and populating its data into the csv file
# NOTE(review): `data` holds one freru.text and one category.text, so each
# frame built here has a single row, and the frame is only printed, never
# written to the CSV - presumably why the output looks incomplete; confirm.
data = {'particulier_allinfo':[freru.text], 'particulier_tel':[category.text]}
df = DataFrame(data, columns = ['particulier_allinfo', 'particulier_tel'])
print(df)
I am also trying to add pagination to this code, since the URL ends with "page=1, page=2, ..., page=n".
If you could also help me with this, that would be great!
I have been working on this since last week, so any help is appreciated!
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
def main(url, pages=10):
    """Scrape names and phone numbers from paginated 118000.fr results.

    Each page is fetched by formatting its number into `url`. Names come
    from the "h2.name.title.inbl" elements, phone numbers from the
    'mainLine":"<digits>' fragments in the raw page text. Names and numbers
    are paired by position, and only numbers starting with "06" or "07" are
    kept. The result is printed, written to data.csv and returned.

    Args:
        url: URL template with one "{}" placeholder for the page number.
        pages: number of pages to read, starting at page 1 (default 10,
            the value previously hard-coded).

    Returns:
        DataFrame with the columns "Name" and "Phone".
    """
    # Compiled once instead of on every page.
    phone_pattern = re.compile(r'mainLine":"(\d+)')

    data = []
    with requests.Session() as req:
        for page in range(1, pages + 1):
            print(f"Extracting Page# {page}")
            r = req.get(url.format(page))
            soup = bs(r.content, 'html.parser')
            names = [name.text for name in soup.select("h2.name.title.inbl")]
            phone = [ph.group(1) for ph in phone_pattern.finditer(r.text)]
            # zip pairs by position and stops at the shorter list.
            for x, y in zip(names, phone):
                if y.startswith(("06", "07")):
                    data.append([x, y])

    df = pd.DataFrame(data, columns=["Name", "Phone"])
    print(df)
    df.to_csv("data.csv", index=False)
    print("Data Saved to data.csv")
    return df


main("https://www.118000.fr/search?part=1&who=kevin&page={}")
Output: View-Online
Sample:
I feel like I'm close to getting there, but I can't think of a way to turn these lists into a .csv. Could anybody help?
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Fetch the Wikipedia page and pull header and cell texts out of the first
# sortable wikitable.
wiki = requests.get('https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States')
soup = BeautifulSoup(wiki.content, 'html.parser')
tables = soup.find_all('table', class_='wikitable sortable')

first_table = tables[0]

# Text of every header cell of the first table.
column_names = []
for header_cell in first_table.find_all('th'):
    column_names.append(header_cell.get_text())

# Text of every data cell, as one flat list (not grouped by row).
content = []
for data_cell in first_table.find_all('td'):
    content.append(data_cell.get_text())

# Frame with the column labels only; `content` is not loaded into it here.
df = pd.DataFrame(columns=column_names)
Try this:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Read every sortable wikitable on the page into one DataFrame and save it
# to test.csv.
from io import StringIO

wiki = requests.get('https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States')
soup = BeautifulSoup(wiki.content, 'html.parser')
tables = soup.find_all('table', class_='wikitable sortable')

# read_html returns a list of frames for each table element; gather them all
# and concatenate once. DataFrame.append was removed in pandas 2.0.
frames = []
for x in tables:
    # Wrap the markup in StringIO so read_html receives a file-like object.
    frames.extend(pd.read_html(StringIO(str(x))))

# pd.concat raises on an empty list; fall back to an empty frame.
alltables = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(alltables)
alltables.to_csv('test.csv')