I have been trying to download data from different URLs and then save it to a CSV file.
The idea is to extract the highlighted data from: https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
So far I built the following piece of code:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
url_is = 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow'
read_data = ur.urlopen(url_is).read()
soup_is=BeautifulSoup(read_data, 'lxml')
row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
data=[cell.text for cell in row.parent.select('td') if cell.text!='']
df=pd.DataFrame(data)
print(df.T)
I get the expected dividends row as output. All good so far.
Now my idea is to extract specific classes from multiple URLs, keep the same headers from the website and export it to a .csv.
The tags and classes stay the same across URLs.
Sample URLs:
https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow
I wanted to try with 2 columns first: 2015 and 2016. As desired output I would like something like one row per ticker, with the 2015 and 2016 values as columns.
I wrote the following code, but it is giving me issues; any help or advice is welcome:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
import numpy as np
import requests
links = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow', 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
container = pd.DataFrame(columns=['Name', 'Name2'])
pos=0
for l in links:
    read_data = ur.urlopen(l).read()
    soup_is = BeautifulSoup(read_data, 'lxml')
    row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
    results = [cell.text for cell in row.parent.select('td') if cell.text != '']
    records = []
    for result in results:
        records = []
        Name = result.find('span', attrs={'itemprop': '2015'}).text if result.find('span', attrs={'itemprop': '2015'}) is not None else ''
        Name2 = result.find('span', attrs={'itemprop': '2016'}).text if result.find('span', attrs={'itemprop': '2016'}) is not None else ''
        records.append(Name)
        records.append(Name2)
    container.loc[pos] = records
    pos += 1
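The main problem in the snippet above is that results holds the td text strings, not tags, so result.find('span', attrs={...}) is calling str.find, which takes no keyword arguments and raises a TypeError. A simpler route is to let pandas.read_html locate the matching row directly: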
import requests
import pandas as pd
urls = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow',
'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
def main(urls):
    with requests.Session() as req:
        goal = []
        for url in urls:
            r = req.get(url)
            df = pd.read_html(r.content, match="Cash Dividends Paid - Total")[0].iloc[[0], 0:3]
            goal.append(df)
        new = pd.concat(goal)
        print(new)

main(urls)
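Since the goal is a .csv, the combined frame can be written out inside main, right after the print; a minimal addition (the filename is illustrative):

        new.to_csv("cash_dividends.csv", index=False)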
I'm using BeautifulSoup to extract some data off of a wiki, but I can only get the first value of a specific column. If my understanding of for loops is correct, it should loop through everything in the table. I tested this by printing "t" to the console, and it shows all the data in HTML format. Is there a reason why this is happening?
from bs4 import BeautifulSoup
import requests, csv
import pandas as pd
wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
table_id = "wikitable"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')
#table = soup.find('table', class_="wikitable")
table = soup.find_all('table', class_="wikitable")
with open('chinesewords.csv', 'w', encoding='utf8', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified, pinyin"])
    for t in table:
        simplified = t.find('span', class_="Hans").text
        print(simplified)
The output:
δΈ€
(I apologize in advance if I didn't follow the rules of StackOverflow posting, as this is my first time posting a question)
Make your life easier and try pandas.read_html().
Here's an example:
import requests
import pandas as pd
table = (
    pd.read_html(
        requests.get(
            "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
        ).text,
        flavor="lxml"
    )[0]
)
table.to_csv("mandarin_frequency_lists.csv", index=False)
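Note that pandas.read_html returns a list of every table it finds in the markup, so the trailing [0] picks the first one; flavor="lxml" only selects the underlying parser.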
If you mean data from one column of the table, the following code is enough. I hope this helps:
from bs4 import BeautifulSoup
import requests, csv
import pandas as pd
wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')
table_column = soup.find_all('span', class_="Hans")
with open('chinesewords.csv', 'w', encoding='utf32', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified"])  # one header cell for the one column written
    for t in table_column:
        simplified = t.text
        print(simplified)
        writer.writerow([simplified])  # wrap in a list, or csv splits the string into characters
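If the goal is the original two-column CSV (simplified plus pinyin), a sketch that walks the table row by row keeps values from the same row paired. The cell indexes below are assumptions, so check them against the page's actual column order:

from bs4 import BeautifulSoup
import requests, csv

wiki_url = "https://en.wiktionary.org/wiki/Appendix:Mandarin_Frequency_lists/1-1000"
soup = BeautifulSoup(requests.get(wiki_url).text, 'html.parser')

with open('chinesewords.csv', 'w', encoding='utf8', newline='') as c:
    writer = csv.writer(c)
    writer.writerow(["simplified", "pinyin"])
    for table in soup.find_all('table', class_="wikitable"):
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            # header rows use th, so they have no td cells and are skipped;
            # cells[1]/cells[2] as word/pinyin is an assumed column order
            if len(cells) >= 3:
                writer.writerow([cells[1].get_text(strip=True),
                                 cells[2].get_text(strip=True)])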
I'm new to Python and I'm having some trouble importing a simple XML file from the web and converting it into a pandas DF:
https://www.ecb.europa.eu/stats/policy_and_exchange_rates/euro_reference_exchange_rates/html/cny.xml
I tried several methods, including BS4, but I couldn't get them to work.
from bs4 import BeautifulSoup
import requests
socket = requests.get('https://www.ecb.europa.eu/stats/policy_and_exchange_rates/euro_reference_exchange_rates/html/cny.xml')
soup = bs4.BeautifulSoup(socket.content, ['lxml', 'xml'])
all_obs = soup.find_all('Obs')
l = []
df = pd.DataFrame(columns=['TIME_PERIOD','OBS_VALUE'])
pos= 0
for obs in all_obs:
    l.append(obs.find('TIME_PERIOD').text)
    l.append(obs.find('OBS_VALUE').text)
    df.loc[pos] = l
    l = []
    pos += 1
print(df)
Could someone help me?
Thanks
from bs4 import BeautifulSoup
import requests
import pandas as pd
response = requests.get('https://www.ecb.europa.eu/stats/policy_and_exchange_rates/euro_reference_exchange_rates/html/cny.xml')
bs = BeautifulSoup(response.text, 'xml')
obs = bs.find_all("Obs")
#<Obs OBS_CONF="F" OBS_STATUS="A" OBS_VALUE="10.7255" TIME_PERIOD="2005-04-01"/>
# TIME_PERIOD and OBS_VALUE are attributes of each Obs tag, so read them
# with .get(); build the rows first and create the frame in one go
# (DataFrame.append was removed in pandas 2.0)
records = [{'TIME_PERIOD': node.get("TIME_PERIOD"),
            'OBS_VALUE': node.get("OBS_VALUE")} for node in obs]
df = pd.DataFrame(records, columns=['TIME_PERIOD', 'OBS_VALUE'])
df.head()
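This also shows why the original attempt failed: in this feed TIME_PERIOD and OBS_VALUE are attributes of the self-closing Obs tag (see the sample above), not child elements, so obs.find('TIME_PERIOD') returns None and calling .text on it raises an AttributeError. Reading the attributes with .get() is the fix.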
I'm looking to scrape this link for just two simple pieces of information, but I don't know why I get the following result, which doesn't give me all the data I'm searching for:
particulier_allinfo particulier_tel 0 ABEL KEVIN10 RUE VIRGILE67200 Strasbourg
This is the code; thanks for your help:
import bs4 as bs
import urllib
import urllib.request
import requests
from bs4 import BeautifulSoup
import pandas
from pandas import DataFrame
import csv
with open('test_bs_118000.csv', mode='w') as csv_file:
    fieldnames = ['AllInfo', 'Tel']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

particulier_allinfo = []
particulier_tel = []

i = 1
historyurl = "https://www.118000.fr/search?part=1&who=kevin&page=" + str(i)
historypage = urllib.request.urlopen(historyurl)
soup = bs.BeautifulSoup(historypage, 'html.parser')

cat = 1
for category in soup.findAll('a', {'class': 'clickable atel'}):
    print(cat)
    print(category.text)
    cat = cat + 1

q = 1
for freru in soup.findAll('div', {'class': 'cardbanner'}):
    print(q)
    print(freru.text)
    q = q + 1

# creating the data frame and populating its data into the csv file
data = {'particulier_allinfo': [freru.text], 'particulier_tel': [category.text]}
df = DataFrame(data, columns=['particulier_allinfo', 'particulier_tel'])
print(df)
print(df)
I am also trying to add pagination to this code, since the URL ends with "page=1", "page=2", ..., "page=n".
If you can also help me with this, it would be very nice!
I have been looking into this since last week; please help!
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
def main(url):
    with requests.Session() as req:
        data = []
        for page in range(1, 11):
            print(f"Extracting Page# {page}")
            r = req.get(url.format(page))
            soup = bs(r.content, 'html.parser')
            names = [name.text for name in soup.select("h2.name.title.inbl")]
            phone = [ph.group(1) for ph in re.finditer(r'mainLine":"(\d+)', r.text)]
            for x, y in zip(names, phone):
                if y.startswith(("06", "07")):
                    data.append([x, y])
        df = pd.DataFrame(data, columns=["Name", "Phone"])
        print(df)
        df.to_csv("data.csv", index=False)
        print("Data Saved to data.csv")

main("https://www.118000.fr/search?part=1&who=kevin&page={}")
I've written code that scrapes contact information from a webpage using BeautifulSoup and CommonRegex, a ready-made library of regular expressions for US address information. While I'm able to extract the information as lists and convert it into a pandas DataFrame, I'm not able to save all the values present in a list. This is the code I've written:
import pandas as pd
from commonregex import CommonRegex
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://www.thetaxshopinc.com/pages/contact-tax-accountant-brampton'
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
for link in soup.find_all('p'):
    df = CommonRegex()
    df1 = df.street_addresses(link.get_text())
    df2 = df.phones(link.get_text())
    df3 = df.emails(link.get_text())
    for i in df1:
        dfr = pd.DataFrame([i], columns=['Address'])
    for j in df2:
        dfr1 = pd.DataFrame([j], columns=['Phone_no'])
        dfr1['Phone_no'] = dfr1['Phone_no'].str.cat(sep=', ')
        dfr1.drop_duplicates(inplace=True)
    for k in df3:
        dfr2 = pd.DataFrame([k], columns=['Email'])
    dfc = pd.concat([dfr, dfr1, dfr2], axis=1)
The result I'm getting keeps only one value per field. But since the regular expression is extracting 3 values for the phone number, the result should contain all of them.
I've no clue how to solve this issue; it would be great if you guys could help me.
This should do:
import pandas as pd
from commonregex import CommonRegex
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://www.thetaxshopinc.com/pages/contact-tax-accountant-brampton'
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
dict_data = {'address': [], 'phone_no': [], 'email': []}
crex = CommonRegex()
for link in soup.find_all('p'):
    str_add = crex.street_addresses(link.get_text())
    phone = crex.phones(link.get_text())
    email = crex.emails(link.get_text())
    if str_add:
        dict_data['address'].append(str_add[0])
    if phone:
        dict_data['phone_no'].append(', '.join(phone))
    if email:
        dict_data['email'].append(email[0])
df = pd.DataFrame(dict_data)
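One caveat: pd.DataFrame(dict_data) requires the three lists to be equally long, which only holds if every paragraph that matches one pattern matches all three. A hedged variant of the loop (same names as above) pads missing fields so the columns always stay aligned:

for link in soup.find_all('p'):
    text = link.get_text()
    str_add = crex.street_addresses(text)
    phone = crex.phones(text)
    email = crex.emails(text)
    # record only paragraphs that matched something, padding the other
    # fields with empty strings so the three lists keep equal lengths
    if str_add or phone or email:
        dict_data['address'].append(str_add[0] if str_add else '')
        dict_data['phone_no'].append(', '.join(phone) if phone else '')
        dict_data['email'].append(email[0] if email else '')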
I have the code below; I need the output of the print to update a new column.
import pandas as pd
import re
import numpy as np
import urllib.parse
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
df = pd.read_csv('IR006.csv')
pd.set_option('display.max_colwidth', -1)
df4 = pd.read_csv('IR006.csv')
df4['UPDATE'] = "" **#This is the column where i wanna see the output of the for loop**
So, here is the loop that fetches the data from each URL:
for link in df4.iterrows():
    url = link[1]['URL'].replace('/v01/', '/depot/')
    x = urlopen(url)
    new = x.read()
    soup = BeautifulSoup(new, "lxml-xml")
    match = ''.join(re.findall(r"[C][L]\S{8}", str(soup)))
    print(match)
Output:
CLdbDQgFdD
CLYwHQYDVR
CLYwHQYDVR
CLYwHQYDVR
CLYwHQYDVR
The DataFrame itself has the URL column plus the still-empty UPDATE column.
So how can I put the data that the loop generates into the new column named "UPDATE"?
Try the following code:
for idx, row in df4.iterrows():
    url = row['URL'].replace('/v01/', '/depot/')
    x = urlopen(url)
    new = x.read()
    soup = BeautifulSoup(new, "lxml-xml")
    match = ''.join(re.findall(r"[C][L]\S{8}", str(soup)))
    df4.at[idx, 'UPDATE'] = match
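If the filled-in column should survive the session, write the frame back out; overwriting IR006.csv here is an assumption, so use a different name to keep the original file:

df4.to_csv('IR006.csv', index=False)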