I feel like I'm close to getting there, but I can't think of a way to turn these lists into a .csv. Could anybody help?
import requests
from bs4 import BeautifulSoup
import pandas as pd
wiki = requests.get('https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States')
soup = BeautifulSoup(wiki.content, 'html.parser')
tables = soup.find_all('table', class_='wikitable sortable')
column_names = [item.get_text() for item in tables[0].find_all('th')]
content = [item.get_text() for item in tables[0].find_all('td')]
df = pd.DataFrame(columns=column_names)
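For completeness, one way to finish this exact approach is sketched below; it assumes every data row of the first wikitable has exactly len(column_names) cells (rowspan/colspan cells would break that), and the output filename is just an example:
import requests
from bs4 import BeautifulSoup
import pandas as pd

wiki = requests.get('https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States')
soup = BeautifulSoup(wiki.content, 'html.parser')
table = soup.find_all('table', class_='wikitable sortable')[0]
column_names = [th.get_text(strip=True) for th in table.find_all('th')]
content = [td.get_text(strip=True) for td in table.find_all('td')]

# Chunk the flat list of cell texts into rows of len(column_names) columns each
n = len(column_names)
rows = [content[i:i + n] for i in range(0, len(content), n)]
pd.DataFrame(rows, columns=column_names).to_csv('shootings.csv', index=False)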
Try this:
import requests
from bs4 import BeautifulSoup
import pandas as pd
wiki = requests.get('https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States')
soup = BeautifulSoup(wiki.content, 'html.parser')
tables = soup.find_all('table', class_='wikitable sortable')
#column_names = [item.get_text() for item in tables[0].find_all('th')]
# read_html returns a list of DataFrames for each table's HTML;
# collect them all and concatenate at the end (DataFrame.append was
# deprecated and removed in newer pandas, so use pd.concat instead)
frames = []
for x in tables:
    frames.extend(pd.read_html(str(x)))
alltables = pd.concat(frames, ignore_index=True)
print(alltables)
alltables.to_csv('test.csv', index=False)
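For this particular page you can also let pandas fetch and parse everything in one call, provided the site accepts pandas' default user agent (Wikipedia usually does). A sketch; read_html returns every table it finds, so you may need to pick or filter the ones you want from the list:
import pandas as pd

# read_html downloads the page and returns one DataFrame per HTML table it finds
dfs = pd.read_html('https://en.wikipedia.org/wiki/List_of_mass_shootings_in_the_United_States')
pd.concat(dfs, ignore_index=True).to_csv('test.csv', index=False)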
I'm using BeautifulSoup to extract some data off a wiki, but I can only get the first entry of a specific column. If my understanding of for loops is correct, it should loop through everything in the table. I tested this by printing "t" to the console, and it shows all the data in HTML format. Is there a reason why this is happening?
from bs4 import BeautifulSoup
import requests, csv
import pandas as pd
wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
table_id = "wikitable"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')
#table = soup.find('table', class_="wikitable")
table = soup.find_all('table', class_="wikitable")
with open('chinesewords.csv', 'w', encoding='utf8', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified, pinyin"])
    for t in table:
        simplified = t.find('span', class_="Hans").text
        print(simplified)
The output:
一
(I apologize in advance if I didn't follow the rules of StackOverflow posting, as this is my first time posting a question)
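For context on why only one character prints: find_all('table', class_="wikitable") returns the table elements, not the rows, and t.find('span', class_="Hans") returns only the first matching span inside each table, so the loop body runs once per table rather than once per word. A minimal sketch that loops over the spans instead (assuming, as in the question's code, that each entry's simplified form sits in a span with class Hans):
from bs4 import BeautifulSoup
import requests, csv

wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
soup = BeautifulSoup(requests.get(wiki_url).text, 'html.parser')

with open('chinesewords.csv', 'w', encoding='utf8', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified"])
    # Iterate over every simplified-character span inside the wikitable
    for span in soup.select('table.wikitable span.Hans'):
        writer.writerow([span.get_text(strip=True)])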
Make your life easier and try pandas.read_html().
Here's an example:
import requests
import pandas as pd
table = pd.read_html(
    requests.get(
        "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
    ).text,
    flavor="lxml",
)[0]
table.to_csv("mandarin_frequency_lists.csv", index=False)
If you only need the data from one column of the table, the following code is enough. I hope this helps:
from bs4 import BeautifulSoup
import requests, csv
import pandas as pd
wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')
table_column = soup.find_all('span', class_="Hans")
with open('chinesewords.csv', 'w', encoding='utf32', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified"])
    for t in table_column:
        simplified = t.text
        print(simplified)
        # Wrap the string in a list, otherwise csv.writer splits it into characters
        writer.writerow([simplified])
I need to scrape content that sits in a div class nested inside another div class which repeats, so I needed to use find_all to get them. I want the text of each element, so that when I put them in a DataFrame I get the name inside the tag (as you would with find(...).text) instead of the entire HTML line.
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://www.grammy.com/grammys/awards/winners-nominees/138'
page = requests.get(url).text
soup = BeautifulSoup(page,'lxml')
category = soup.find_all('div', class_ = "view-grouping-content")
print(len(category))
for c in category:
    artistName = c.find_all('div', class_ = "views-field views-field-field-description")
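One way to finish the question's own approach is to call get_text() on each element that find_all returns, since find_all yields a list of tags rather than a single tag. A minimal sketch along those lines (assuming the text of each views-field div is the name you are after):
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.grammy.com/grammys/awards/winners-nominees/138'
soup = BeautifulSoup(requests.get(url).text, 'lxml')

names = []
for c in soup.find_all('div', class_="view-grouping-content"):
    # find_all returns a list of tags; take the text of each one
    for div in c.find_all('div', class_="views-field views-field-field-description"):
        names.append(div.get_text(strip=True))

df = pd.DataFrame(names, columns=['name'])
print(df)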
import requests
from bs4 import BeautifulSoup
def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    goal = [x.text for x in soup.select(
        '.freelink.freelink-nid.freelink-internal')]
    print(goal)
main('https://www.grammy.com/grammys/awards/winners-nominees/138')
Another angle of attack (if this could be from another site)...
import pandas as pd
import requests
url = r'https://en.wikipedia.org/wiki/Grammy_Award_for_Record_of_the_Year'
page = requests.get(url)
tables = pd.read_html(page.text)
# Combine tables 1-8 from the page into one DataFrame
df = pd.concat(tables[1:9])
df.dropna(thresh=3, inplace=True)
df = df.rename(columns={'Year[I]': 'Year'})
# Strip footnote markers such as [1] from the year and quotes from the record title
df['Year'] = df['Year'].str.replace(r'\[\d+\]', '', regex=True)
df['Record'] = df['Record'].str.replace('"', '', regex=False)
print(df)
I'm trying to transfer the data of a long table (24 pages) to a pandas DataFrame, but I'm facing some issues with (I think) the for-loop code.
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'https://scrapethissite.com/pages/forms/?page_num={}'
res = requests.get(base_url.format('1'))
soup = BeautifulSoup(res.text, 'lxml')
table = soup.select('table.table')[0]
columns = table.find('tr').find_all('th')
columns_names = [str(c.get_text()).strip() for c in columns]
table_rows = table.find_all('tr', class_='team')
l = []
for n in range(1, 25):
    scrape_url = base_url.format(n)
    res = requests.get(scrape_url)
    soup = BeautifulSoup(res.text, 'lxml')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [str(tr.get_text()).strip() for tr in td]
        l.append(row)

df = pd.DataFrame(l, columns=columns_names)
The Dataframe comes out as a repetition of the first page only, rather than a copy of all the data in the table.
I agree with #mxbi: table and table_rows are parsed from page 1 only, before the loop, so every iteration appends the same first-page rows again. Parse each page inside the loop instead. Try this:
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'https://scrapethissite.com/pages/forms/?page_num={}'
l = []
for n in range(1, 25):
    scrape_url = base_url.format(n)
    res = requests.get(scrape_url)
    soup = BeautifulSoup(res.text, 'lxml')
    # Re-parse the table on every page so each page's rows get collected
    table = soup.select('table.table')[0]
    columns = table.find('tr').find_all('th')
    columns_names = [str(c.get_text()).strip() for c in columns]
    table_rows = table.find_all('tr', class_='team')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [str(tr.get_text()).strip() for tr in td]
        l.append(row)

df = pd.DataFrame(l, columns=columns_names)
requests is needed because the server wants a user-agent header, and pandas read_html doesn't allow you to set one. Since you still want pandas to build the DataFrame, you can gain some efficiency by using multiprocessing to handle the requests; within a user-defined function, extract the table of interest and pass it as a string to read_html. You get a list of DataFrames which can be combined with pandas concat.
Note: this can't be run from within Jupyter, as it will block.
import pandas as pd
from multiprocessing import Pool, cpu_count
import requests
from bs4 import BeautifulSoup as bs
def get_table(url: str) -> pd.DataFrame:
    soup = bs(requests.get(url).text, 'lxml')
    df = pd.read_html(str(soup.select_one('.table')))[0]
    df['page_num'] = url.split("=")[-1]
    return df

if __name__ == '__main__':
    urls = [f'https://scrapethissite.com/pages/forms/?page_num={i}' for i in range(1, 25)]
    with Pool(cpu_count()-1) as p:
        results = p.map(get_table, urls)
        final = pd.concat(results)
        print(final)
        # final.to_csv('data.csv', index=False, encoding='utf-8-sig')
I have been trying to download data from different URLs and then save it to a CSV file.
The idea is to extract the highlighted data from: https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
So far I built the following piece of code:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
url_is = 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow'
read_data = ur.urlopen(url_is).read()
soup_is=BeautifulSoup(read_data, 'lxml')
row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
data=[cell.text for cell in row.parent.select('td') if cell.text!='']
df=pd.DataFrame(data)
print(df.T)
I get the expected row as output, so all good so far.
Now my idea is to extract specific classes from multiple URLs, keep the same headers from the website and export it to a .csv.
The tags and classes stay the same.
Sample URLs:
https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow
Code (I wanted to try with two columns: 2015 and 2016). As the desired output I would like the 2015 and 2016 values for each ticker, side by side in one table. I wrote the following code, but it is giving me issues; any help or advice is welcome:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
import numpy as np
import requests
links = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow', 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
container = pd.DataFrame(columns=['Name', 'Name2'])
pos=0
for l in links:
    read_data = ur.urlopen(l).read()
    soup_is = BeautifulSoup(read_data, 'lxml')
    row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
    results = [cell.text for cell in row.parent.select('td') if cell.text != '']
    records = []
    for result in results:
        records = []
        Name = result.find('span', attrs={'itemprop': '2015'}).text if result.find('span', attrs={'itemprop': '2015'}) is not None else ''
        Name2 = result.find('span', attrs={'itemprop': '2016'}).text if result.find('span', attrs={'itemprop': '2016'}) is not None else ''
        records.append(Name)
        records.append(Name2)
    container.loc[pos] = records
    pos += 1
import requests
import pandas as pd
urls = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow',
'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
def main(urls):
    with requests.Session() as req:
        goal = []
        for url in urls:
            r = req.get(url)
            df = pd.read_html(
                r.content, match="Cash Dividends Paid - Total")[0].iloc[[0], 0:3]
            goal.append(df)
        new = pd.concat(goal)
        print(new)

main(urls)
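Since the question also asks for a .csv export, the combined DataFrame can be written out instead of (or in addition to) printing it; the filename here is just a placeholder:
new.to_csv('cash_dividends.csv', index=False)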
I've written code which scrapes contact information from a webpage using BeautifulSoup and a pre-built library, CommonRegex, which is basically a set of regular expressions for extracting US address information. While I'm able to extract the information (which comes back as a list) and convert it into a pandas DataFrame, I'm not able to save all the values present in the list. This is the code I've written:
import pandas as pd
from commonregex import CommonRegex
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://www.thetaxshopinc.com/pages/contact-tax-accountant-brampton'
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
for link in soup.find_all('p'):
    df = CommonRegex()
    df1 = df.street_addresses(link.get_text())
    df2 = df.phones(link.get_text())
    df3 = df.emails(link.get_text())
    for i in df1:
        dfr = pd.DataFrame([i], columns=['Address'])
    for j in df2:
        dfr1 = pd.DataFrame([j], columns=['Phone_no'])
        dfr1['Phone_no'] = dfr1['Phone_no'].str.cat(sep=', ')
        dfr1.drop_duplicate(inplace=True)
    for k in df3:
        dfr2 = pd.DataFrame([k], columns=['Email'])

dfc = pd.concat([dfr, dfr1, dfr2], axis=1)
The result I'm getting only keeps one phone number, but the regular expressions actually extract three values for Phone_no, and all of them should end up in the result. I have no clue how to solve this issue; it would be great if you could help.
This should do it:
import pandas as pd
from commonregex import CommonRegex
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://www.thetaxshopinc.com/pages/contact-tax-accountant-brampton'
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
dict_data = {'address': [], 'phone_no': [], 'email': []}

crex = CommonRegex()
for link in soup.find_all('p'):
    str_add = crex.street_addresses(link.get_text())
    phone = crex.phones(link.get_text())
    email = crex.emails(link.get_text())
    if str_add:
        dict_data['address'].append(str_add[0])
    if phone:
        dict_data['phone_no'].append(', '.join(phone))
    if email:
        dict_data['email'].append(email[0])

df = pd.DataFrame(dict_data)
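If you also want the result on disk, a to_csv call at the end is enough (the filename is just an example):
df.to_csv('contacts.csv', index=False)
print(df)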