Update column with data of FOR LOOP Pandas - python

I have the below code, I need that the output of the print update a new column.
import pandas as pd
import re
import numpy as np
import urllib.parse
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
df = pd.read_csv('IR006.csv')
pd.set_option('display.max_colwidth', -1)
df4 = pd.read_csv('IR006.csv')
df4['UPDATE'] = "" **#This is the column where i wanna see the output of the for loop**
So, here is the loop that fetch data from URL:
for link in df4.iterrows():
url = link[1]['URL'].replace('/v01/', '/depot/')
x = urlopen(url)
new = x.read()
soup = BeautifulSoup(new, "lxml-xml")
match = ''.join(re.findall(r"[C][L]\S{8}", str(soup)))
print(match)
Output:
CLdbDQgFdD
CLYwHQYDVR
CLYwHQYDVR
CLYwHQYDVR
CLYwHQYDVR
The Dataframe look like this:
So how I can put the data that generates the loop in a new column name "UPDATE"

Try the following code:
for idx,row in df4.iterrows():
url = row['URL'].replace('/v01/', '/depot/')
x = urlopen(url)
new = x.read()
soup = BeautifulSoup(new, "lxml-xml")
match = ''.join(re.findall(r"[C][L]\S{8}", str(soup)))
df4.at[idx,'UPDATE'] = match

Related

From XML url to Pandas dataframe

I'm new to Python and I'm having some trouble importing a simple XML file from the web and converting it into a pandas DF:
https://www.ecb.europa.eu/stats/policy_and_exchange_rates/euro_reference_exchange_rates/html/cny.xml
I tried several methods, including using BS4, but I didn't get to make them work.
from bs4 import BeautifulSoup
import requests
socket = requests.get('https://www.ecb.europa.eu/stats/policy_and_exchange_rates/euro_reference_exchange_rates/html/cny.xml')
soup = bs4.BeautifulSoup(socket.content, ['lxml', 'xml'])
all_obs = soup.find_all('Obs')
l = []
df = pd.DataFrame(columns=['TIME_PERIOD','OBS_VALUE'])
pos= 0
for obs in all_obs:
l.append(obs.find('TIME_PERIOD').text)
l.append(obs.find('OBS_VALUE').text)
df.loc[pos] = l
l = []
pos+=1
print(df)
Could someone help me?
Thanks
Aiii!
from bs4 import BeautifulSoup
import requests
import pandas as pd
response = requests.get('https://www.ecb.europa.eu/stats/policy_and_exchange_rates/euro_reference_exchange_rates/html/cny.xml')
bs = BeautifulSoup(response.text, ['xml'])
obs = bs.find_all("Obs")
#<Obs OBS_CONF="F" OBS_STATUS="A" OBS_VALUE="10.7255" TIME_PERIOD="2005-04-01"/>
df = pd.DataFrame(columns=['TIME_PERIOD','OBS_VALUE'])
for node in obs:
df = df.append({'TIME_PERIOD': node.get("TIME_PERIOD"), 'OBS_VALUE': node.get("OBS_VALUE")}, ignore_index=True)
df.head()

BeautifulSoup: Scraping CSV list of URLs

I have been trying to download data from different urls and then save it to a csv file.
The idea is extract the highlighted data from: https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
So far I built the following piece of code:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
url_is = 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow'
read_data = ur.urlopen(url_is).read()
soup_is=BeautifulSoup(read_data, 'lxml')
row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
data=[cell.text for cell in row.parent.select('td') if cell.text!='']
df=pd.DataFrame(data)
print(df.T)
I get as an output:
All good so far.
Now my idea is to extract specific classes from multiple URLs, keep the same headers from the website and export it to a .csv.
The tags and classes stay the same
Sample URLs:
https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow
https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow
Code (I wanted to try with 2 columns: 2015 and 2016)
As desidered ouput I would like something like:
I wrote the following code, but is giving me issues, any help or advice is welcome:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as ur
import numpy as np
import requests
links = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow', 'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
container = pd.DataFrame(columns=['Name', 'Name2'])
pos=0
for l in links:
read_data = ur.urlopen(l).read()
soup_is=BeautifulSoup(read_data, 'lxml')
row = soup_is.select_one('tr.mainRow>td.rowTitle:contains("Cash Dividends Paid - Total")')
results=[cell.text for cell in row.parent.select('td') if cell.text!='']
records = []
for result in results:
records = []
Name = result.find('span', attrs={'itemprop':'2015'}).text if result.find('span', attrs={'itemprop':'2015'}) is not None else ''
Name2 = result.find('span', attrs={'itemprop':'2016'}).text if result.find('span', attrs={'itemprop':'2016'}) is not None else ''
records.append(Name)
records.append(Name2)
container.loc[pos] = records
pos+=1
import requests
import pandas as pd
urls = ['https://www.marketwatch.com/investing/stock/aapl/financials/cash-flow',
'https://www.marketwatch.com/investing/stock/MMM/financials/cash-flow']
def main(urls):
with requests.Session() as req:
goal = []
for url in urls:
r = req.get(url)
df = pd.read_html(
r.content, match="Cash Dividends Paid - Total")[0].iloc[[0], 0:3]
goal.append(df)
new = pd.concat(goal)
print(new)
main(urls)

How do I write my data along the columns in a CSV file?

When I write to the csv file all of my data is printed in only the first column. Using my loop, how do I iterate along the columns to write the data?
import csv
import bs4
import urllib
from urllib.request import urlopen as uReq
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
#For sites that can't be opened due to Urllib blocker, use a Mozilla User agent to get access
pageRequest = Request('https://coronavirusbellcurve.com/', headers = {'User-Agent': 'Mozilla/5.0'})
htmlPage = urlopen(pageRequest).read()
page_soup = soup(htmlPage, 'html.parser')
specificDiv = page_soup.find("div", {"class": "table-responsive-xl"})
TbodyStats = specificDiv.table.tbody.tr.contents
TbodyDates = specificDiv.table.thead.tr.contents
with open('CovidHTML.csv','w', newline= '') as file:
theWriter = csv.writer(file)
theWriter.writerow(['5/4', ' 5/5', ' 5/6',' 5/7',' 5/8',' 5/9'])
for i in range(3,len(TbodyStats)):
if i%2 != 0:
theWriter.writerow([TbodyStats[i].text])
Another method, For reference only.
from simplified_scrapy import SimplifiedDoc,utils,req
html = req.get('https://coronavirusbellcurve.com/')
doc = SimplifiedDoc(html)
specificDiv = doc.select('div.table-responsive-xl') # Get first div. If you want to get all divs, use this method: doc.selects('div.table-responsive-xl')
# TbodyStats = specificDiv.tbody.trs.selects('td|th').text # Get data
# TbodyDates = specificDiv.thead.trs.selects('td|th').text # Get date
data = specificDiv.table.trs.selects('td|th').text # Get all
rows = []
for row in data:
rows.append(row[1:])
utils.save2csv('test.csv',rows)
Result:
5/5,5/6,5/7,5/8,5/9
1213260,1237960,1266822,1294664,1314610
24423,24700,28862,27842,19946
2.05%,2.04%,2.33%,2.20%,1.54%
I think you may be able to do this (I can't test for sure because I don't have your exact data on hand):
row = []
for i in range(3, len(TbodyStats), 2):
row.append(TbodyStats[i].text)
if len(row) == 6:
theWriter.writerow(row)
row = []
I added the 'step' to your range so you don't have to use % for finding odd numbered indices, then just built each row until it hits 6 members, then flush that to the csv file, then empty the row so you can repeat the process.

How to save all the scraped data from a website in a pandas dataframe?

I've written a code which scrapes the contact information from a webpage using BeautifulSoup and a pre-designed library CommonRegex which is basically regular expressions to scrape US address information.While I'm able to extract the information which is in the form of a list and convert it into pandas dataframe, I'm not able to save all the values present in a list. This is the code I've written:
import pandas as pd
from commonregex import CommonRegex
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://www.thetaxshopinc.com/pages/contact-tax-accountant-brampton'
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
for link in soup.find_all('p'):
df = CommonRegex()
df1 = df.street_addresses(link.get_text())
df2 = df.phones(link.get_text())
df3 = df.emails(link.get_text())
for i in df1:
dfr = pd.DataFrame([i], columns = ['Address'])
for j in df2:
dfr1 = pd.DataFrame([j], columns = ['Phone_no'])
dfr1['Phone_no'] = dfr1['Phone_no'].str.cat(sep=', ')
dfr1.drop_duplicate(inplace = True)
for k in df3:
dfr2 = pd.DataFrame([k], columns = ['Email'])
dfc = pd.concat([dfr, dfr1, dfr2], axis = 1)
This is the result I'm getting:-
But, since the regular expressions is extracting 3 values for Phone no, namely,
The result should be like this:-
I've no clue how to solve this issue, would be great if you guys could help me.
This should do:
import pandas as pd
from commonregex import CommonRegex
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://www.thetaxshopinc.com/pages/contact-tax-accountant-brampton'
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
dict_data = {'address':[], 'phone_no': [], 'email': []
}
crex = CommonRegex()
for link in soup.find_all('p'):
str_add = crex.street_addresses(link.get_text())
phone = crex.phones(link.get_text())
email = crex.emails(link.get_text())
if str_add:
dict_data['address'].append(str_add[0])
if phone:
dict_data['phone_no'].append(', '.join(phone))
if email:
dict_data['email'].append(email[0])
df = pd.DataFrame(dict_data)

Saving multiple data frames from loop

I have been searching for a solution to my problem, but all answers I find uses print() at the end of the answer, and NOT saving the data frames as I would like to.
Below I have a (almost) functioning code that prints 3 seperate tables. How do I save these three tables in 3 seperate data frames with the names matches_october, matches_november and matches_december?
The last line in my code is not working as I want it to work. I hope it is clear what I would like the code to do (Saving a data frame at the end of each of the 3 rounds in the loop)
import pandas as pd
import requests
from bs4 import BeautifulSoup
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'
for i in valid_pages:
url = '{}{}{}'.format(base_url, i, end)
res = requests.get(url)
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
print(df)
matches + valid_pages = df[0]
You can case it, but that's not very robust (and it's rather ugly).
if i == 'october':
matches_october = pd.read_html(str(table))
if i == 'november':
# so on and so forth
A more elegant solution is to use a dictionary. Before the loop, declare matches = {}. Then, in each iteration:
matches[i] = pd.read_html(str(table))
Then you can access the October matches DataFrame via matches['october'].
You can't compose variable names using +, try using a dict instead:
import pandas as pd
import requests
from bs4 import BeautifulSoup
matches = {} # create an empty dict
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'
for i in valid_pages:
url = '{}{}{}'.format(base_url, i, end)
res = requests.get(url)
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
print(df)
matches[i] = df[0] # store it in the dict
Thanks guys. That worked! :)
import pandas as pd
import requests
from bs4 import BeautifulSoup
matches = {} # create an empty dict
base_url = 'https://www.basketball-reference.com/leagues/NBA_2019_games-'
valid_pages = ['october','november','december']
end = '.html'
for i in valid_pages:
url = '{}{}{}'.format(base_url, i, end)
res = requests.get(url)
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
matches[i] = df[0] # store it in the dict
matches_october = matches['october']

Categories