I need to extract data from four strings that have been parsed in BeautifulSoup. They are:
Arkansas72.21:59 AM76.29:04 AM5.22977.37:59 AM
Ashley71.93:39 AM78.78:59 AM0.53678.78:59 AM
Bradley72.64:49 AM77.28:59 AM2.41877.28:49 AM
Chicot-40.19:04 AM-40.19:04 AM2.573-40.112:09 AM
The data from the first string, for example, is Arkansas, 72.2, 1:59 AM, 76.2, 9:04 AM, 5.2, 29, 77.3, and 7:59 AM. Is there a simple way to do this?
Edit: full code
import urllib2
from bs4 import BeautifulSoup
import time

def scraper():
    # Arkansas State Plant Board Weather Web data
    url1 = 'http://170.94.200.136/weather/Inversion.aspx'
    # opens url and parses HTML into Unicode
    page1 = urllib2.urlopen(url1)
    soup1 = BeautifulSoup(page1, 'lxml')
    # print(soup.get_text()) gives a single Unicode string of relevant data in strings from the url
    # Without print(), returns everything without proper spacing
    sp1 = soup1.get_text()
    # datasp1 is the chunk with the website data in it so the search for Arkansas doesn't return the header
    # everything else finds the locations of the Unicode strings for the first four stations
    start1 = sp1.find('Today')
    end1 = sp1.find('new Sys.')
    datasp1 = sp1[start1:end1-10]
    startArkansas = datasp1.find('Arkansas')
    startAshley = datasp1.find('Ashley')
    dataArkansas = datasp1[startArkansas:startAshley-2]
    startBradley = datasp1.find('Bradley')
    dataAshley = datasp1[startAshley:startBradley-2]
    startChicot = datasp1.find('Chicot')
    dataBradley = datasp1[startBradley:startChicot-2]
    startCleveland = datasp1.find('Cleveland')
    dataChicot = datasp1[startChicot:startCleveland-2]
    print(dataArkansas)
    print(dataAshley)
    print(dataBradley)
    print(dataChicot)
Just improve the way you extract the tabular data. I would use pandas.read_html() to read it into a dataframe which, I'm pretty sure, you would find convenient to work with:
import pandas as pd
df = pd.read_html("http://170.94.200.136/weather/Inversion.aspx", attrs={"id": "MainContent_GridView1"})[0]
print(df)
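From there you can slice out a single station's row. A quick sketch; the "Station" column name is an assumption, so check df.columns for the real header:

# Hypothetical usage: select one station's row from the frame.
# "Station" is an assumed column name -- inspect df.columns for the real one.
print(df[df["Station"] == "Arkansas"])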
You need to use BeautifulSoup to parse the HTML page and retrieve your data:
from urllib2 import urlopen
from bs4 import BeautifulSoup

url1 = 'http://170.94.200.136/weather/Inversion.aspx'
# opens url and parses HTML into Unicode
page1 = urlopen(url1)
soup1 = BeautifulSoup(page1, 'lxml')
# get the table
table = soup1.find(id='MainContent_GridView1')
# find the headers
headers = [h.get_text() for h in table.find_all('th')]
# retrieve data, keyed by station name
data = {}
tr_elems = table.find_all('tr')
for tr in tr_elems:
    tr_content = [td.get_text() for td in tr.find_all('td')]
    if tr_content:  # skip the header row, which has no <td> cells
        data[tr_content[0]] = dict(zip(headers[1:], tr_content[1:]))
print(data)
This example shows:
{
    "Greene West": {
        "Low Temp (\u00b0F)": "67.7",
        "Time Of High": "10:19 AM",
        "Wind Speed (MPH)": "0.6",
        "High Temp (\u00b0F)": "83.2",
        "Wind Dir (\u00b0)": "20",
        "Time Of Low": "6:04 AM",
        "Current Time": "10:19 AM",
        "Current Temp (\u00b0F)": "83.2"
    },
    "Cleveland": {
        "Low Temp (\u00b0F)": "70.8",
        "Time Of High": "10:14 AM",
        "Wind Speed (MPH)": "1.9",
        [.....]
}
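With the data keyed by station name, lookups are then straightforward. A small usage sketch, using the station and header names from the output above:

# Fetch one field for one station from the parsed dict.
print(data['Cleveland']['Low Temp (°F)'])  # -> '70.8'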
I want to scrape specific data from the GuruFocus website.
https://www.gurufocus.com/stock/AAHTF/summary?search=AAPICO
Currently I am fetching only the number value. For example: the Financial Strength value is "4" out of 10. Now I want to fetch the sub-component data as well.
Code to fetch only the number value:
import time
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# names is a list of company names defined elsewhere
for name in names:
    start_time = time.time()
    # getting the symbol
    URL = f'https://www.gurufocus.com/search?s={name}'
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(URL)
    html_source = driver.page_source
    driver.close()
    soup = BeautifulSoup(html_source, 'html.parser')
    headers = soup.find_all("span")
    # saving only the first link
    for i, head in enumerate(headers):
        try:
            h = head.find("a").get("href")
            link = "https://www.gurufocus.com" + h
            break
        except:
            pass
    try:
        # loading the link page
        driver = webdriver.Chrome(ChromeDriverManager().install())
        driver.get(link)
        html_source = driver.page_source
        driver.close()
        soup = BeautifulSoup(html_source, 'html.parser')
        headers = soup.find_all("span", class_="t-default bold")
        ratings = []
        for head in headers:
            ratings.append(int(head.get_text()))
        if len(ratings) == 0:
            continue
        ratings_dict = {"Financial Strength": ratings[0],
                        "Growth Rank": ratings[1],
                        "Momentum Rank": ratings[2],
                        "Profitability Rank": ratings[3],
                        "GF Value Rank": ratings[4],
                        }
        print(ratings_dict)
        # ratings_dict = json.loads(ratings_dict)
        with open(f"output/gurufocus/{name}.json", 'w') as f:
            json.dump(str(ratings_dict), f)
        end_time = time.time()
        print("time taken for %s is: %.2f" % (name, (end_time - start_time)))
    except:
        print("no data found")
Output:
"{'Financial Strength': 6, 'Growth Rank': 4, 'Momentum Rank': 4, 'Profitability Rank': 7, 'GF Value Rank': 5}"
Expectation:
I want to fetch the full table data (image below) along with the rank into a data frame.
How do I need to change my code to obtain the other specific data?
You can use Pandas to write a clean solution for this problem:
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
from collections import ChainMap
tables = pd.read_html(
requests.get(
'https://www.gurufocus.com/stock/AAHTF/summary?search=AAPICO'
).text,
header=0
)
# one {Name: Current} dict per row, one list per table
sub_table_values = [
    [{record["Name"]: record["Current"]} for record in json.loads(e)]
    for e in [i.to_json(orient="records") for i in tables]
]
# merge each table's row-dicts into a single dict
sub_formatted = [dict(ChainMap(*a)) for a in sub_table_values]
print(json.dumps(sub_formatted, indent=4))
Description:
First, I obtain all the tables and convert them to DataFrames (using pandas).
Then, I convert the DataFrames to JSON and extract only the desired result (Name and Current).
Finally, I format the data.
It would return:
[
    {
        "WACC vs ROIC": null,
        "Beneish M-Score": "-2.28",
        "Altman Z-Score": "1.98",
        "Piotroski F-Score": "7/9",
        "Interest Coverage": "4.68",
        "Debt-to-EBITDA": "2.55",
        "Debt-to-Equity": "0.85",
        "Equity-to-Asset": "0.37",
        "Cash-To-Debt": "0.1"
    },
    {
        "Future 3-5Y Total Revenue Growth Rate": 13.71,
        "3-Year Book Growth Rate": 2.8,
        "3-Year FCF Growth Rate": 49.9,
        "3-Year EPS without NRI Growth Rate": -5.2,
        "3-Year EBITDA Growth Rate": 9.0,
        "3-Year Revenue Growth Rate": 9.6
    }...
]
However, this solution works because the page is structured with tables. For complex/irregular websites I prefer to use Scrapy, as we do at my job.
Trying to scrape some weather data off of Weather Underground. I haven't had any difficulty getting the data of interest until I came to getting the day/date, hi/lo temps, and forecast (i.e. "Partly Cloudy"). Each is in a div without a class. The parent of each is a div with class="obs-date" (see image below).
[WxUn HTML image][1]
Attempted code below with other options commented out. Each returns an empty list.
import requests as req
from bs4 import BeautifulSoup

def get_wx(city, state):
    city = city.lower()
    state = state.lower()
    # get current conditions; 'weather' in url
    current_dict = get_current(city, state)  # helper defined elsewhere
    # get forecast; 'forecast' in url
    f_url = f'https://www.wunderground.com/forecast/us/{state}/{city}'
    f_response = req.get(f_url)
    f_soup = BeautifulSoup(f_response.text, 'html.parser')
    cast_dates = f_soup.find_all('div', class_="obs-date")
    # cast_dates = f_soup.find_all('div', attrs={"class": "obs-date"})
    # cast_dates = f_soup.select('div.obs-date')
    print(cast_dates)

get_wx("Portland", "ME")
Any help with what I'm missing is appreciated.
As far as I can see, the whole block you're trying to parse is rendered by JavaScript, which is why you're getting empty results with BeautifulSoup.
The ADDITIONAL CONDITIONS part could be parsed completely with bs4, as could everything below it. The table at the end could be parsed with pandas (see the sketch after the output below).
To scrape JavaScript-rendered content, you can use the requests-html or selenium libraries.
from requests_html import HTMLSession
import json

session = HTMLSession()
url = "https://www.wunderground.com/weather/us/me/portland"
response = session.get(url)
response.html.render(sleep=1)

data = []
current_date = response.html.find('.timestamp strong', first=True).text
weather_conditions = response.html.find('.condition-icon p', first=True).text
gusts = response.html.find('.medium-uncentered span', first=True).text
current_temp = response.html.find('.current-temp .is-degree-visible', first=True).text

data.append({
    "Last update": current_date,
    "Current weather": weather_conditions,
    "Temperature": current_temp,
    "Gusts": gusts,
})

print(json.dumps(data, indent=2, ensure_ascii=False))
Output:
[
    {
        "Last update": "1:27 PM EDT on April 14, 2021",
        "Current weather": "Fair",
        "Temperature": "49 F",
        "Gusts": "13 mph"
    }
]
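For the table mentioned above, one option is to hand the rendered markup to pandas. A sketch, reusing the rendered response from the snippet above; the [0] index is an assumption, so inspect the list for the table you want:

import pandas as pd

# Parse every <table> out of the already-rendered HTML.
# pd.read_html raises ValueError if no tables are found.
tables = pd.read_html(response.html.html)
print(tables[0])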
<script type="text/javascript">
'sku': 'T3246B5',
'Name': 'TAS BLACKY',
'Price': '111930',
'categories': 'Tas,Wanita,Sling Bags,Di bawah Rp 200.000',
'brand': '',
'visibility': '4',
'instock': "1",
'stock': "73.0000"
</script>
I want to scrape the text between 'stock': " and .0000", so the desirable result is 73.
What I know how to do is something like this:
# imports assumed from the original script, matching the aliases used below
from urllib.request import Request, urlopen as uReq
from bs4 import BeautifulSoup as soup

for url2 in urls2:  # urls2 is a list of product URLs defined elsewhere
    req2 = Request(url2, headers={'User-Agent': 'Chrome/39.0.2171.95'})
    html2 = uReq(req2).read()
    page_soup2 = soup(html2, "html.parser")
    # Grab text
    stock = page_soup2.findAll("p", {"class": "stock"})
    stocks = stock[0].text
I used something like this in my previous code. It worked before the site changed its code.
But now there is more than one ("script", {"type": "text/javascript"}) on the page I want to scrape, so I don't know how to find the right one. I also don't know how to get the specific text between the markers before and after it.
I have googled it all day but can't find the solution. Please help.
I found that the strings 'stock': " and .0000" are unique on the entire page: there is only one 'stock': and only one .0000".
So I think they could mark the location of the text I want to scrape.
Please help, and thank you for your kindness.
I also apologize for my lack of English, and I am actually unfamiliar with programming. I'm just trying to learn from Google, but I can't find the answer. Thank you for your understanding.
The url: view-source:sophieparis.com/blacky-bag.html
Since you are sure 'stock' only shows up in the script tag you want, you can pull out the text of that tag. Once you have it, it's a matter of trimming off the excess and changing the single quotes to double quotes to get it into valid JSON format, then simply reading it in with json.loads().
import requests
from bs4 import BeautifulSoup
import json

url2 = 'https://www.sophieparis.com/blacky-bag.html'
req2 = requests.get(url2, headers={'User-Agent': 'Chrome/39.0.2171.95'})
page_soup2 = BeautifulSoup(req2.text, "html.parser")

scripts = page_soup2.find_all('script')
for script in scripts:
    if 'stock' in script.text:
        jsonStr = script.text

jsonStr = jsonStr.split('productObject = ')[-1].strip()
jsonStr = jsonStr.rsplit('}', 1)[0].strip() + '}'
jsonData = json.loads(jsonStr.replace("'", '"'))
print(jsonData['stock'].split('.')[0])
Output:
71
You could also do this without the loop and just grab the script that contains the string stock using one line:
jsonStr = page_soup2.find('script', text=re.compile(r'stock')).text
Full code would look something like:
import requests
from bs4 import BeautifulSoup
import json
import re
url2 = 'https://www.sophieparis.com/blacky-bag.html'
req2 = requests.get(url2, headers={'User-Agent': 'Chrome/39.0.2171.95'})
page_soup2 = BeautifulSoup(req2.text, "html.parser")
jsonStr = page_soup2.find('script', text=re.compile(r'stock')).text
jsonStr = jsonStr.split('productObject = ')[-1].strip()
jsonStr = jsonStr.rsplit('}',1)[0].strip() + '}'
jsonData = json.loads(jsonStr.replace("'",'"'))
print(jsonData['stock'].split('.')[0])
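Note that in newer BeautifulSoup releases the text= keyword is deprecated in favor of string=, so the same lookup can be written as:

jsonStr = page_soup2.find('script', string=re.compile(r'stock')).text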
I would write a regex that targets the javascript dictionary variable that houses the values of interest. You can apply this direct to response.text with no need for bs4.
The dictionary variable is called productObject, and you want the non-empty dictionary, which is the second occurrence of productObject = {..}, i.e. not the one preceded by 'var '. You can use a negative lookbehind to specify this requirement.
Use hjson to handle property names enclosed in single quotes.
import requests, re, hjson
r = requests.get('https://www.sophieparis.com/blacky-bag.html')
p = re.compile(r'(?<!var\s)productObject = ([\s\S]*?})')
data = hjson.loads(p.findall(r.text)[0])
print(data)
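From there, the 73 the question asks for is one more split away; hjson returns 'stock' as the string "73.0000":

# Keep only the integer part of the "73.0000" stock string.
print(data['stock'].split('.')[0])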
If you want to provide me with the webpage you wish to scrape the data from, I'll see if I can fix the code to pull the information.
Trying to scrape a table from multiple webpages and store in a list. The list prints out the results from the first webpage 3 times.
import pandas as pd
import requests
from bs4 import BeautifulSoup

dflist = []
for i in range(1, 4):
    s = requests.Session()
    res = requests.get(r'http://www.ironman.com/triathlon/events/americas/ironman/world-championship/results.aspx?p=' + str(i) + 'race=worldchampionship&rd=20181013&agegroup=Pro&sex=M&y=2018&ps=20#axzz5VRWzxmt3')
    soup = BeautifulSoup(res.content, 'lxml')
    table = soup.find_all('table')
    dfs = pd.read_html(str(table))
    dflist.append(dfs)
    s.close()
print(dflist)
You left out the & after '?p=' + str(i), so your requests all have p set to ${NUMBER}race=worldchampionship, which ironman.com presumably can't make sense of and just ignores. Insert a & at the beginning of 'race=worldchampionship'.
To prevent this sort of mistake in the future, you can pass the URL's query parameters as a dict to the params keyword argument like so:
params = {
    "p": i,
    "race": "worldchampionship",
    "rd": "20181013",
    "agegroup": "Pro",
    "sex": "M",
    "y": "2018",
    "ps": "20",
}
res = requests.get(r'http://www.ironman.com/triathlon/events/americas/ironman/world-championship/results.aspx#axzz5VRWzxmt3', params=params)
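requests then assembles and percent-encodes the query string itself, so a separator can't silently go missing; you can verify the result:

# The assembled URL should now contain "?p=1&race=worldchampionship&..."
print(res.url)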
I am learning to use beautifulsoup and python to extract an html table. I tried using the following code to extract the balance sheet for Google. However, I can't seem to get all the rows scraped correctly.
I can't manage to omit rows that are just spacers, and I can't manage to extract the Totals rows (e.g. Total Assets).
Any advice? Advice on simplifying the code would also be valuable.
from bs4 import BeautifulSoup
import requests

def bs_extract(stock_ticker):
    url = 'https://finance.yahoo.com/q/bs?s=' + str(stock_ticker) + '&annual'
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    c1 = ""
    c2 = ""
    c3 = ""
    c4 = ""
    c5 = ""
    table = soup.find("table", {"class": "yfnc_tabledata1"})
    # print(table)
    for row in table.findAll("tr"):
        cells = row.findAll("td")
        if len(cells) == 5:
            c1 = cells[0].find(text=True)
            c2 = cells[1].find(text=True)
            c3 = cells[2].find(text=True)
            c4 = cells[3].find(text=True)
            c5 = cells[4].find(text=True)
        elif len(cells) == 6:
            c1 = cells[1].find(text=True)
            c2 = cells[2].find(text=True)
            c3 = cells[3].find(text=True)
            c4 = cells[4].find(text=True)
            c5 = cells[5].find(text=True)
        elif len(cells) == 1:
            c1 = cells[0].find(text=True)
            c2 = ""
            c3 = ""
            c4 = ""
            c5 = ""
        else:
            pass
        print(c1, c2, c3, c4, c5)

bs_extract('goog')
You might find it easier to get this data in structured form through YQL. See http://goo.gl/qKeWXw