I have been trying to download data from different URLs and then save it to a CSV file.
The idea is to extract the highlighted data from: https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
So far I built the following piece of code:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
url_is = 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow'
read_data = ur.urlopen(url_is).read()
soup_is=BeautifulSoup(read_data, 'lxml')
row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
data=[cell.text for cell in row.parent.select('td') if cell.text!='']
df=pd.DataFrame(data)
print(df.T)
I get the expected dividends row as output. All good so far.
Now my idea is to extract specific classes from multiple URLs, keep the same headers from the website and export it to a .csv.
The tags and classes stay the same across URLs.
Sample URLs:
https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow
I wanted to try with 2 columns first: 2015 and 2016. As desired output I would like something like one row per ticker, with the 2015 and 2016 values as columns.
I wrote the following code, but it is giving me issues; any help or advice is welcome:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
import numpy as np
import requests
links = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow', 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
container = pd.DataFrame(columns=['Name', 'Name2'])
pos=0
for l in links:
    read_data = ur.urlopen(l).read()
    soup_is = BeautifulSoup(read_data, 'lxml')
    row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
    results = [cell.text for cell in row.parent.select('td') if cell.text != '']
    records = []
    for result in results:
        records = []
        Name = result.find('span', attrs={'itemprop': '2015'}).text if result.find('span', attrs={'itemprop': '2015'}) is not None else ''
        Name2 = result.find('span', attrs={'itemprop': '2016'}).text if result.find('span', attrs={'itemprop': '2016'}) is not None else ''
        records.append(Name)
        records.append(Name2)
    container.loc[pos] = records
    pos += 1
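The main problem in the snippet above is that results holds the td text strings, not tags, so result.find('span', attrs={...}) is calling str.find, which takes no keyword arguments and raises a TypeError. A simpler route is to let pandas.read_html locate the matching row directly: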
import requests
import pandas as pd
urls = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow',
'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
def main(urls):
    with requests.Session() as req:
        goal = []
        for url in urls:
            r = req.get(url)
            df = pd.read_html(r.content, match="Cash Dividends Paid - Total")[0].iloc[[0], 0:3]
            goal.append(df)
        new = pd.concat(goal)
        print(new)

main(urls)
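Since the goal is a .csv, the combined frame can be written out inside main, right after the print; a minimal addition (the filename is illustrative):

        new.to_csv("cash_dividends.csv", index=False)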
I'm using BeautifulSoup to extract some data off of a wiki, but I can only get the first value of a specific column. If my understanding of for loops is correct, it should loop through everything in the table. I tested this by printing "t" to the console, and it shows all the data in HTML format. Is there a reason why this is happening?
from bs4 import BeautifulSoup
import requests, csv
import pandas as pd
wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
table_id = "wikitable"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')
#table = soup.find('table', class_="wikitable")
table = soup.find_all('table', class_="wikitable")
with open('chinesewords.csv', 'w', encoding='utf8', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified, pinyin"])
    for t in table:
        simplified = t.find('span', class_="Hans").text
        print(simplified)
The output:
δΈ€
(I apologize in advance if I didn't follow the rules of StackOverflow posting, as this is my first time posting a question)
Make your life easier and try pandas.read_html().
Here's an example:
import requests
import pandas as pd
table = (
    pd.read_html(
        requests.get(
            "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
        ).text,
        flavor="lxml"
    )[0]
)
table.to_csv("mandarin_frequency_lists.csv", index=False)
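Note that pandas.read_html returns a list of every table it finds in the markup, so the trailing [0] picks the first one; flavor="lxml" only selects the underlying parser.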
If you mean data from one column of the table, the following code is enough. I hope this helps:
from bs4 import BeautifulSoup
import requests, csv
import pandas as pd
wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')
table_column = soup.find_all('span', class_="Hans")
with open('chinesewords.csv', 'w', encoding='utf32', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified"])  # one header cell for the one column written
    for t in table_column:
        simplified = t.text
        print(simplified)
        writer.writerow([simplified])  # wrap in a list, or csv splits the string into characters
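If the goal is the original two-column CSV (simplified plus pinyin), a sketch that walks the table row by row keeps values from the same row paired. The cell indexes below are assumptions, so check them against the page's actual column order:

from bs4 import BeautifulSoup
import requests, csv

wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
soup = BeautifulSoup(requests.get(wiki_url).text, 'html.parser')

with open('chinesewords.csv', 'w', encoding='utf8', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified", "pinyin"])
    for table in soup.find_all('table', class_="wikitable"):
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            # header rows use th, so they have no td cells and are skipped;
            # cells[1]/cells[2] as word/pinyin is an assumed column order
            if len(cells) >= 3:
                writer.writerow([cells[1].get_text(strip=True),
                                 cells[2].get_text(strip=True)])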
I'm new to Python and I'm having some trouble importing a simple XML file from the web and converting it into a pandas DF:
https://www.ecb.europa.eu/stats/policy_and_exchange_rates/euro_reference_exchange_rates/html/cny.xml
I tried several methods, including BS4, but I couldn't get them to work.
from bs4 import BeautifulSoup
import requests
socket = requests.get('https://www.ecb.europa.eu/stats/policy_and_exchange_rates/euro_reference_exchange_rates/html/cny.xml')
soup = bs4.BeautifulSoup(socket.content, ['lxml', 'xml'])
all_obs = soup.find_all('Obs')
l = []
df = pd.DataFrame(columns=['TIME_PERIOD','OBS_VALUE'])
pos= 0
for obs in all_obs:
    l.append(obs.find('TIME_PERIOD').text)
    l.append(obs.find('OBS_VALUE').text)
    df.loc[pos] = l
    l = []
    pos += 1
print(df)
Could someone help me?
Thanks
from bs4 import BeautifulSoup
import requests
import pandas as pd
response = requests.get('https://www.ecb.europa.eu/stats/policy_and_exchange_rates/euro_reference_exchange_rates/html/cny.xml')
bs = BeautifulSoup(response.text, 'xml')
obs = bs.find_all("Obs")
#<Obs OBS_CONF="F" OBS_STATUS="A" OBS_VALUE="10.7255" TIME_PERIOD="2005-04-01"/>
# TIME_PERIOD and OBS_VALUE are attributes of each Obs tag, so read them
# with .get(); build the rows first and create the frame in one go
# (DataFrame.append was removed in pandas 2.0)
records = [{'TIME_PERIOD': node.get("TIME_PERIOD"),
            'OBS_VALUE': node.get("OBS_VALUE")} for node in obs]
df = pd.DataFrame(records, columns=['TIME_PERIOD', 'OBS_VALUE'])
df.head()
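This also shows why the original attempt failed: in this feed TIME_PERIOD and OBS_VALUE are attributes of the self-closing Obs tag (see the sample above), not child elements, so obs.find('TIME_PERIOD') returns None and calling .text on it raises an AttributeError. Reading the attributes with .get() is the fix.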
I'm looking to scrape this link for just two simple pieces of information, but I don't know why I get the following result, which doesn't give me all the data I'm searching for:
particulier_allinfo particulier_tel 0 ABEL KEVIN10 RUE VIRGILE67200 Strasbourg
This is the code; thanks for your help:
import bs4 as bs
import urllib
import urllib.request
import requests
from bs4 import BeautifulSoup
import pandas
from pandas import DataFrame
import csv
with open('test_bs_118000.csv', mode='w') as csv_file:
    fieldnames = ['AllInfo', 'Tel']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

particulier_allinfo = []
particulier_tel = []

i = 1
historyurl = "https://www.118000.fr/search?part=1&who=kevin&page=" + str(i)
historypage = urllib.request.urlopen(historyurl)
soup = bs.BeautifulSoup(historypage, 'html.parser')

cat = 1
for category in soup.findAll('a', {'class': 'clickable atel'}):
    print(cat)
    print(category.text)
    cat = cat + 1

q = 1
for freru in soup.findAll('div', {'class': 'cardbanner'}):
    print(q)
    print(freru.text)
    q = q + 1

# creating the data frame and populating its data into the csv file
data = {'particulier_allinfo': [freru.text], 'particulier_tel': [category.text]}
df = DataFrame(data, columns=['particulier_allinfo', 'particulier_tel'])
print(df)
print(df)
I am also trying to add pagination to this code, since the URL ends with "page=1", "page=2", ..., "page=n".
If you can also help me with this, it would be very nice!
I have been looking into this since last week; please help!
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
def main(url):
    with requests.Session() as req:
        data = []
        for page in range(1, 11):
            print(f"Extracting Page# {page}")
            r = req.get(url.format(page))
            soup = bs(r.content, 'html.parser')
            names = [name.text for name in soup.select("h2.name.title.inbl")]
            phone = [ph.group(1) for ph in re.finditer(r'mainLine":"(\d+)', r.text)]
            for x, y in zip(names, phone):
                if y.startswith(("06", "07")):
                    data.append([x, y])
        df = pd.DataFrame(data, columns=["Name", "Phone"])
        print(df)
        df.to_csv("data.csv", index=False)
        print("Data Saved to data.csv")

main("https://www.118000.fr/search?part=1&who=kevin&page={}")
I've written code that scrapes contact information from a webpage using BeautifulSoup and CommonRegex, a ready-made library of regular expressions for US address information. While I'm able to extract the information as lists and convert it into a pandas DataFrame, I'm not able to save all the values present in a list. This is the code I've written:
import pandas as pd
from commonregex import CommonRegex
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://www.thetaxshopinc.com/pages/contact-tax-accountant-brampton'
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
for link in soup.find_all('p'):
    df = CommonRegex()
    df1 = df.street_addresses(link.get_text())
    df2 = df.phones(link.get_text())
    df3 = df.emails(link.get_text())
    for i in df1:
        dfr = pd.DataFrame([i], columns=['Address'])
    for j in df2:
        dfr1 = pd.DataFrame([j], columns=['Phone_no'])
        dfr1['Phone_no'] = dfr1['Phone_no'].str.cat(sep=', ')
        dfr1.drop_duplicates(inplace=True)
    for k in df3:
        dfr2 = pd.DataFrame([k], columns=['Email'])
    dfc = pd.concat([dfr, dfr1, dfr2], axis=1)
The result I'm getting keeps only one value per field. But since the regular expression is extracting 3 values for the phone number, the result should contain all of them.
I've no clue how to solve this issue; it would be great if you guys could help me.
This should do:
import pandas as pd
from commonregex import CommonRegex
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://www.thetaxshopinc.com/pages/contact-tax-accountant-brampton'
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
dict_data = {'address': [], 'phone_no': [], 'email': []}
crex = CommonRegex()
for link in soup.find_all('p'):
    str_add = crex.street_addresses(link.get_text())
    phone = crex.phones(link.get_text())
    email = crex.emails(link.get_text())
    if str_add:
        dict_data['address'].append(str_add[0])
    if phone:
        dict_data['phone_no'].append(', '.join(phone))
    if email:
        dict_data['email'].append(email[0])
df = pd.DataFrame(dict_data)
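One caveat: pd.DataFrame(dict_data) requires the three lists to be equally long, which only holds if every paragraph that matches one pattern matches all three. A hedged variant of the loop (same names as above) pads missing fields so the columns always stay aligned:

for link in soup.find_all('p'):
    text = link.get_text()
    str_add = crex.street_addresses(text)
    phone = crex.phones(text)
    email = crex.emails(text)
    # record only paragraphs that matched something, padding the other
    # fields with empty strings so the three lists keep equal lengths
    if str_add or phone or email:
        dict_data['address'].append(str_add[0] if str_add else '')
        dict_data['phone_no'].append(', '.join(phone) if phone else '')
        dict_data['email'].append(email[0] if email else '')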
I have the code below; I need the output of the print to update a new column.
import pandas as pd
import re
import numpy as np
import urllib.parse
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
df = pd.read_csv('IR006.csv')
pd.set_option('display.max_colwidth', -1)
df4 = pd.read_csv('IR006.csv')
df4['UPDATE'] = "" **#This is the column where i wanna see the output of the for loop**
So, here is the loop that fetches the data from each URL:
for link in df4.iterrows():
    url = link[1]['URL'].replace('/v01/', '/depot/')
    x = urlopen(url)
    new = x.read()
    soup = BeautifulSoup(new, "lxml-xml")
    match = ''.join(re.findall(r"[C][L]\S{8}", str(soup)))
    print(match)
Output:
CLdbDQgFdD
CLYwHQYDVR
CLYwHQYDVR
CLYwHQYDVR
CLYwHQYDVR
The DataFrame itself has the URL column plus the still-empty UPDATE column.
So how can I put the data that the loop generates into the new column named "UPDATE"?
Try the following code:
for idx, row in df4.iterrows():
    url = row['URL'].replace('/v01/', '/depot/')
    x = urlopen(url)
    new = x.read()
    soup = BeautifulSoup(new, "lxml-xml")
    match = ''.join(re.findall(r"[C][L]\S{8}", str(soup)))
    df4.at[idx, 'UPDATE'] = match
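If the filled-in column should survive the session, write the frame back out; overwriting IR006.csv here is an assumption, so use a different name to keep the original file:

df4.to_csv('IR006.csv', index=False)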