I am unsure how to add the column data to the empty dataframe.
Here is my code.
!pip install bs4

from bs4 import BeautifulSoup
import requests
import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_largest_banks?utm_medium=Exinfluencer&utm_source=Exinfluencer&utm_content=000026UJ&utm_term=10006555&utm_id=NA-SkillsNetwork-Channel-SkillsNetworkCoursesIBMDeveloperSkillsNetworkPY0221ENSkillsNetwork23455645-2021-01-01'
r = requests.get(url)
data = r.content
soup = BeautifulSoup(data, 'html.parser')

data = pd.DataFrame(columns=["Name", "Market Cap (US$ Billion)"])
for row in soup.find_all('tbody')[3].find_all('tr'):
    col = row.find_all('td')
When I then run
data.head(5)
I retrieve an empty DataFrame.
This was a lot of digging and troubleshooting... but I keep forgetting that it's always the case with web scraping.
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = 'https://en.wikipedia.org/wiki/List_of_largest_banks?utm_medium=Exinfluencer&utm_source=Exinfluencer&utm_content=000026UJ&utm_term=10006555&utm_id=NA-SkillsNetwork-Channel-SkillsNetworkCoursesIBMDeveloperSkillsNetworkPY0221ENSkillsNetwork23455645-2021-01-01'
r = requests.get(url)
data = r.content
soup = BeautifulSoup(data, 'html.parser')
data = {
    'Name': [],
    'Market Cap (US$ Billion)': []
}

x = soup.find_all('tbody')[3].find_all('tr')
for i in range(1, len(x)):  # start at 1 to skip the header row
    # the second <td> holds the bank name inside its second <a>; slice off the tag markup
    data['Name'].append(str(x[i].find_all('td')[1].find_all('a')[1]).split('>')[1][:-3])
    # the third <td> holds the market cap; slice off the <td> tags and trailing newline
    data['Market Cap (US$ Billion)'].append(str(x[i].find_all('td')[2])[4:-6])

df = pd.DataFrame(data)
df.head()
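A note on the slicing above: the same values can be pulled with BeautifulSoup's .text accessor instead of counting characters in str() output, which is less fragile if the markup shifts. A minimal sketch of the same loop, assuming the same table layout:

# Same extraction as above, but via .text instead of slicing str() output
data = {'Name': [], 'Market Cap (US$ Billion)': []}
x = soup.find_all('tbody')[3].find_all('tr')
for i in range(1, len(x)):
    cells = x[i].find_all('td')
    data['Name'].append(cells[1].find_all('a')[1].text)
    data['Market Cap (US$ Billion)'].append(cells[2].text.strip())
df = pd.DataFrame(data)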
I have been trying to download data from different URLs and then save the result to a CSV file.
The idea is to extract the highlighted data from: https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
So far I have built the following piece of code:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
url_is = 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow'
read_data = ur.urlopen(url_is).read()
soup_is = BeautifulSoup(read_data, 'lxml')

row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
data = [cell.text for cell in row.parent.select('td') if cell.text != '']
df = pd.DataFrame(data)
print(df.T)
I get the expected row as output. All good so far.
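One caveat: newer versions of soupsieve (the CSS engine behind BeautifulSoup's .select_one) deprecate the :contains() pseudo-class in favor of :-soup-contains(), so on a current install the selector may need to be written as:

# Equivalent selector under soupsieve 2.1+, where :contains() is deprecated
row = soup_is.select_one('tr.mainRow>td.rowTitle:-soup-contains("Cash Dividends Paid - Total")')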
Now my idea is to extract specific classes from multiple URLs, keep the same headers from the website, and export the result to a .csv.
The tags and classes stay the same.
Sample URLs:
https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow
Code (I wanted to try with 2 columns: 2015 and 2016):
As desired output I would like something like:
I wrote the following code, but it is giving me issues; any help or advice is welcome:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
import numpy as np
import requests
links = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow', 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
container = pd.DataFrame(columns=['Name', 'Name2'])
pos = 0

for l in links:
    read_data = ur.urlopen(l).read()
    soup_is = BeautifulSoup(read_data, 'lxml')
    row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
    results = [cell.text for cell in row.parent.select('td') if cell.text != '']
    records = []
    for result in results:
        records = []
        Name = result.find('span', attrs={'itemprop': '2015'}).text if result.find('span', attrs={'itemprop': '2015'}) is not None else ''
        Name2 = result.find('span', attrs={'itemprop': '2016'}).text if result.find('span', attrs={'itemprop': '2016'}) is not None else ''
        records.append(Name)
        records.append(Name2)
        container.loc[pos] = records
        pos += 1
An alternative is to let pandas' read_html locate the target row directly, instead of parsing spans by hand:

import requests
import pandas as pd

urls = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow',
        'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']

def main(urls):
    with requests.Session() as req:
        goal = []
        for url in urls:
            r = req.get(url)
            # read_html parses every table; match= keeps only those containing the target row
            df = pd.read_html(r.content, match="Cash Dividends Paid - Total")[0].iloc[[0], 0:3]
            goal.append(df)
        new = pd.concat(goal)
        print(new)

main(urls)
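Since the stated goal was a CSV file, the concatenated frame can be written out inside main as a last step; a minimal sketch (the filename is just an example):

# Export the combined rows; 'cash_flows.csv' is a placeholder name
new.to_csv('cash_flows.csv', index=False)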
I've written code that scrapes contact information from a webpage using BeautifulSoup and CommonRegex, a pre-built library of regular expressions for US address information. I can extract the information as lists and convert them into a pandas DataFrame, but I'm not able to save all the values present in a list. This is the code I've written:
import pandas as pd
from commonregex import CommonRegex
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://www.thetaxshopinc.com/pages/contact-tax-accountant-brampton'
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
for link in soup.find_all('p'):
    df = CommonRegex()
    df1 = df.street_addresses(link.get_text())
    df2 = df.phones(link.get_text())
    df3 = df.emails(link.get_text())
    for i in df1:
        dfr = pd.DataFrame([i], columns=['Address'])
    for j in df2:
        dfr1 = pd.DataFrame([j], columns=['Phone_no'])
        dfr1['Phone_no'] = dfr1['Phone_no'].str.cat(sep=', ')
        dfr1.drop_duplicates(inplace=True)
    for k in df3:
        dfr2 = pd.DataFrame([k], columns=['Email'])

dfc = pd.concat([dfr, dfr1, dfr2], axis=1)
This is the result I'm getting:
But since the regex extracts three values for Phone_no, namely,
the result should be like this:
I have no clue how to solve this issue; it would be great if you could help.
This should do:
import pandas as pd
from commonregex import CommonRegex
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://www.thetaxshopinc.com/pages/contact-tax-accountant-brampton'
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
# Collect everything into one dict of lists, then build the DataFrame once
dict_data = {'address': [], 'phone_no': [], 'email': []}

crex = CommonRegex()
for link in soup.find_all('p'):
    str_add = crex.street_addresses(link.get_text())
    phone = crex.phones(link.get_text())
    email = crex.emails(link.get_text())
    if str_add:
        dict_data['address'].append(str_add[0])
    if phone:
        # join all phone numbers found in this paragraph into one cell
        dict_data['phone_no'].append(', '.join(phone))
    if email:
        dict_data['email'].append(email[0])

df = pd.DataFrame(dict_data)
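One thing to watch with this approach: pd.DataFrame(dict_data) raises a ValueError if the three lists end up with different lengths, which can happen if some <p> tag yields an address but no phone or email. Wrapping each list in a Series lets pandas pad the shorter columns with NaN instead; a small sketch under that assumption:

# Build each column as a Series so unequal lengths are padded with NaN
df = pd.DataFrame({key: pd.Series(values) for key, values in dict_data.items()})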
I am having trouble finding a table while web scraping using Python/Beautiful Soup.
import requests
from bs4 import BeautifulSoup
url = 'https://www.espn.com/nba/player/gamelog/_/id/3907387/ben-simmons'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
t = soup.find_all('table' , class_='Table Table--align-right')
This returns an empty list.
You can use pandas' read_html to parse all the tables on the page into a list and pick the one you need, as below:
import requests
import pandas as pd

url = 'https://www.espn.com/nba/player/gamelog/_/id/3907387/ben-simmons'
html = requests.get(url).content
df_list = pd.read_html(html)  # parses every <table> on the page into a DataFrame
df = df_list[3]               # the game log is the fourth table on this page
print(df)
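Hardcoding df_list[3] breaks as soon as ESPN reorders the page. read_html's match parameter filters tables by the text they contain, which is sturdier; a sketch assuming the game log table contains an 'OPP' column header:

# match= keeps only tables whose text matches the pattern ('OPP' is an assumed header)
df = pd.read_html(html, match='OPP')[0]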
I feel like I'm close to getting there, but I can't think of a way to turn these lists into a .csv. Could anybody help?
import requests
from bs4 import BeautifulSoup
import pandas as pd
wiki = requests.get('https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States')
soup = BeautifulSoup(wiki.content, 'html.parser')
tables = soup.find_all('table', class_='wikitable sortable')
column_names = [item.get_text() for item in tables[0].find_all('th')]
content = [item.get_text() for item in tables[0].find_all('td')]
df = pd.DataFrame(columns=column_names)
Try this:
import requests
from bs4 import BeautifulSoup
import pandas as pd

wiki = requests.get('https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States')
soup = BeautifulSoup(wiki.content, 'html.parser')
tables = soup.find_all('table', class_='wikitable sortable')

frames = []
for x in tables:
    # read_html returns a list of DataFrames parsed from the HTML string
    frames.extend(pd.read_html(str(x)))

# DataFrame.append was removed in pandas 2.0; concat covers the same use
alltables = pd.concat(frames, ignore_index=True)
print(alltables)
alltables.to_csv('test.csv')
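Worth noting: read_html can also fetch the URL itself and filter tables by attributes, which skips the BeautifulSoup pass entirely. A sketch under the assumption that only the wikitable sortable tables are wanted:

import pandas as pd

# read_html downloads the page and keeps only tables matching attrs
frames = pd.read_html('https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States',
                      attrs={'class': 'wikitable sortable'})
pd.concat(frames, ignore_index=True).to_csv('test.csv', index=False)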
I tested my code in a Jupyter notebook with this code
...
rname = soup.find('p', 'con_tx')
#rnamelis = rname.findAll('p')
rname

from urllib.request import urljoin

story = []
#review_text = lis[0].find('p').getText()
#list_soup = soup.find_all('p', 'con_tx')
story = rname.getText()
story
and it worked well. The result was the synopsis string: '전 여친에 ...'
But when I tried to scrape multiple pages:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import urljoin
import pandas as pd
import numpy as np
import requests
base_url = 'https://movie.naver.com/movie/bi/mi/basic.nhn?code='
pages = ['177374', '164102']
url = base_url + pages[0]
story = []

for n in pages:
    # Create url
    url = base_url + n
    # Parse data using BS
    print('Downloading page %s...' % url)
    res = requests.get(url)
    res.raise_for_status()
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    #print(soup.find('p', 'con_tx'))
    rname = soup.find('p', 'con_tx')
    story = rname.getText()
    data = {story}
    df = pd.DataFrame(data)

df.head()
df.to_csv('./moviestory.csv', sep=',', encoding='EUC-KR')
This produced an error message:
ValueError: DataFrame constructor not properly called!
How do I fix my code?
Not sure what you are trying to do, but one thing I'm noticing is that you overwrite your DataFrame on every pass through the loop. I also don't know why you initialise story as a list and then rebind it inside the loop; note that data = {story} builds a set, not a dictionary, which is exactly why the DataFrame constructor complains.
from bs4 import BeautifulSoup
import pandas as pd
import requests
base_url = 'https://movie.naver.com/movie/bi/mi/basic.nhn?code='
pages =['177374','164102']
rows = []
for n in pages:
    # Create url
    url = base_url + n
    # Parse data using BS
    print('Downloading page %s...' % url)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    rname = soup.find('p', 'con_tx')
    story = rname.getText()
    rows.append([story])

# DataFrame.append was removed in pandas 2.0, so collect the rows and build once
df = pd.DataFrame(rows).reset_index(drop=True)
df.to_csv('./moviestory.csv', sep=',', encoding='EUC-KR')
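One fragility worth flagging (a sketch, not part of the answer above): soup.find returns None when a page has no <p class="con_tx">, and .getText() then raises an AttributeError. A small guard inside the loop keeps the run alive:

# Hypothetical guard: skip pages where the synopsis paragraph is missing
rname = soup.find('p', 'con_tx')
if rname is None:
    print('No synopsis found on %s, skipping' % url)
    continue
rows.append([rname.getText()])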