I have a problem with passing a variable from a method to method within a given class.
The code is this (I am practicing beginner):
class Calendar():
def __init__(self,link):
self.link = link
self.request = requests.get(link)
self.request.encoding='UTF-8'
self.soup = BeautifulSoup(self.request.text,'lxml')
def DaysMonth(self):
Dates = []
tds = self.soup.findAll('td', {'class':'action'})
for td in tds:
check = (td.findAll('a')[0].text)
if "Víkendová odstávka" in check:
date = td.findAll('span')[0].text
Dates.append(date)
return Dates
def PrintCal(self):
return ['Víkendová odstávka serverů nastane ' + date + '. den v měsíci.' for date in Dates]
def main(self):
PrintCal(DaysMonth())
I would like to pass the list Dates from the method DaysMonth to the method PrintCal. When I instantiate the class, i.e. cal = Calendar('link'), and run cal.PrintCal(), I get an error saying that the name Dates is not defined. If I run cal.DaysMonth(), the output is as expected.
What is the issue here? Thank you!
Dates is a local variable in the DaysMonth method, and is therefore not visible anywhere else. Fortunately, DaysMonth does return Dates, so it's easy to get the value you want. Simply add the following line to your PrintCal method (before the return statement):
Dates = self.DaysMonth()
You are trying to do too much in the Calendar object, particularly in the __init__ method. You don't want to combine the scraping and parsing of a website with the instantiation of the object. I would use the Calendar object to store and display the results of the scraping/parsing. If you need everything to be object-oriented, then create a separate Scraper/Parser class that handles that part of the logic.
class Calendar():
    """Hold a collection of date strings and format them for display.

    The class only stores and formats data; fetching and parsing the web
    page is done by the caller before the object is created.
    """

    def __init__(self, dates):
        """Store ``dates``, an iterable of strings, one per outage day."""
        self.dates = dates

    def display_dates(self):
        """Return a list with one announcement sentence per stored date.

        Each date is concatenated into the sentence with ``+``, so every
        element of ``self.dates`` must be a string.
        """
        return ['Víkendová odstávka serverů nastane ' + date + '. den v měsíci.'
                for date in self.dates]
# requests.get() does not accept an ``encoding`` keyword (it raises
# TypeError); the encoding is set on the response before reading ``.text``.
r = requests.get(link)
r.encoding = 'UTF-8'
soup = BeautifulSoup(r.text, 'lxml')

# Collect the text of the first <span> of every "action" cell whose first
# link mentions the weekend outage.
dates = []
for td in soup.findAll('td', {'class': 'action'}):
    check = td.findAll('a')[0].text
    if "Víkendová odstávka" in check:
        dates.append(td.findAll('span')[0].text)

c = Calendar(dates=dates)
# display_dates is a method: it has to be called, otherwise print() shows
# the bound-method object instead of the formatted sentences.
print(c.display_dates())
Related
I'm trying to set up a loop to pull in weather data for about 500 weather stations for an entire year which I have in my dataframe. The base URL stays the same, and the only part that changes is the weather station ID.
I'd like to create a dataframe with the results. I believe I'd use requests.get to pull in data for all the weather stations in my list; the IDs to use in the URL are in a column called "API ID" in my dataframe. I am a Python beginner, so any help would be appreciated! My code is below, but it doesn't work and returns an error:
"InvalidSchema: No connection adapters were found for '0 " http://www.ncei.noaa.gov/access/services/data/...\nName: API ID, Length: 497, dtype: object'
.
def callAPI(API_id):
for IDs in range(len(API_id)):
url = ('http://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries&dataTypes=PRCP,SNOW,TMAX,TMIN&stations=' + distances['API ID'] + '&startDate=2020-01-01&endDate=2020-12-31&includeAttributes=0&includeStationName=true&units=standard&format=json')
r = requests.request('GET', url)
d = r.json()
ll = []
for index1,rows1 in distances.iterrows():
station = rows1['Closest Station']
API_id = rows1['API ID']
data = callAPI(API_id)
ll.append([(data)])
I am not sure about your whole code base, but this is the function that will return the data from the API. If you have multiple station IDs in a single df column, then you can use a for loop; otherwise there is no need for one.
Also, you are not returning the result from the function. Check the return keyword at the end of the function.
Working code:
import requests
def callAPI(API_id):
    """Fetch the 2020 daily-summaries data for one weather station.

    Args:
        API_id: Station identifier string placed in the ``stations`` query
            parameter (for example ``'USC00457180'``).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    url = ('http://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries&dataTypes=PRCP,SNOW,TMAX,TMIN&stations=' + API_id + '&startDate=2020-01-01&endDate=2020-12-31&includeAttributes=0&includeStationName=true&units=standard&format=json')
    r = requests.get(url)
    # Surface HTTP errors here instead of handing an error body to the caller
    # as if it were station data.
    r.raise_for_status()
    return r.json()
print(callAPI('USC00457180'))
So your full code will be something like this,
def callAPI(API_id):
    """Fetch the 2020 daily-summaries data for one weather station.

    Args:
        API_id: Station identifier string placed in the ``stations`` query
            parameter.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    url = ('http://www.ncei.noaa.gov/access/services/data/v1?dataset=daily-summaries&dataTypes=PRCP,SNOW,TMAX,TMIN&stations=' + API_id + '&startDate=2020-01-01&endDate=2020-12-31&includeAttributes=0&includeStationName=true&units=standard&format=json')
    r = requests.get(url)
    # Surface HTTP errors here instead of handing an error body to the caller
    # as if it were station data.
    r.raise_for_status()
    return r.json()
# One API call per dataframe row. Each result is wrapped in a one-element
# list, so ``ll`` ends up as a list of ``[data]`` entries in row order.
ll = []
for index1, rows1 in distances.iterrows():
    station = rows1['Closest Station']
    API_id = rows1['API ID']
    data = callAPI(API_id)
    ll.append([data])
Note: Even better use asynchronous calls to the API to make the process faster. Something like this: https://stackoverflow.com/a/56926297/1138192
I'm creating a web scraper that will be used to value stocks. The problem I have is that my code returns an object "placement" (not sure what it should be called) instead of the value.
import requests
class Guru():
MedianPE = 0.0
def __init__(self, ticket):
self.ticket = ticket
try:
url = ("https://www.gurufocus.com/term/pettm/"+ticket+"/PE-Ratio-TTM/")
response = requests.get(url)
htmlText = response.text
firstSplit = htmlText
secondSplit = firstSplit.split("And the <strong>median</strong> was <strong>")[1]
thirdSplit = secondSplit.split("</strong>")[0]
lastSplit = float(thirdSplit)
try:
Guru.MedianPE = lastSplit
except:
print(ticket + ": Median PE N/A")
except:
print(ticket + ": Median PE N/A")
def getMedianPE(self):
return float(Guru.getMedianPE)
g1 = Guru("AAPL")
g1.getMedianPE
print("Median + " + str(g1))
If I print the lastSplit inside the __init__ it returns the value I want 15.53 but when I try to get it by the function getMedianPE I just get Median + <__main__.Guru object at 0x0000016B0760D288>
Thanks a lot for your time!
Looks like you are trying to cast a function object to a float. Simply change return float(Guru.getMedianPE) to return float(Guru.MedianPE)
getMedianPE is a function (also called object method when part of a class), so you need to call it with parentheses. If you call it without parentheses, you get the method/function itself rather than the result of calling the method/function.
The other problem is that getMedianPE returns the function Guru.getMedianPE rather than the value Guru.MedianPE. I don't think you want MedianPE to be a class variable - you probably just want to set it to a default of 0 in __init__ so that each object has its own median_PE value.
Also, it is not a good idea to include all of the scraping code in your init method. That should be moved to a scrape() method (or some other name) that you call after instantiating the object.
Finally, if you are going to print an object, it is useful to have a __str__ method, so I added a basic one here.
So putting all of those comments together, here is a recommended refactor of your code.
import requests
class Guru():
    """A ticker symbol together with its median P/E value.

    The value starts at the default passed to the constructor and is
    replaced by ``scrape()`` when the GuruFocus PE-Ratio-TTM page can be
    parsed.
    """

    def __init__(self, ticket, median_PE=0):
        """Store the ticker symbol and the initial median P/E value."""
        self.ticket = ticket
        self.median_PE = median_PE

    def __str__(self):
        """Return ``'<ticket> <median_PE>'``."""
        return f'{self.ticket} {self.median_PE}'

    def scrape(self):
        """Download the ticker's page and update ``self.median_PE``.

        The number is taken from the text that follows the "median was"
        marker in the page HTML. If the marker is missing or the text after
        it is not a number, a notice is printed and ``median_PE`` keeps its
        previous value. Network errors raised by ``requests`` are not caught.
        """
        url = f"https://www.gurufocus.com/term/pettm/{self.ticket}/PE-Ratio-TTM/"
        try:
            response = requests.get(url)
            htmlText = response.text
            secondSplit = htmlText.split("And the <strong>median</strong> was <strong>")[1]
            thirdSplit = secondSplit.split("</strong>")[0]
            self.median_PE = float(thirdSplit)
        except (IndexError, ValueError):
            # IndexError: the marker text is not on the page, so split()
            # returned a single piece. ValueError: the captured text is not
            # a valid float.
            print(f"{self.ticket}: Median PE N/A")
Then you run the code
>>>g1 = Guru("AAPL")
...g1.scrape()
...print(g1)
AAPL 15.53
Trying to pass a list from one function to another (teamurls), which then in turn I will use in another program. I have my program working where I have a value output using yield. (yield full_team_urls)
How do I pass the list from the first function into the second (def team_urls)? Also, is it still possible to return the list and continue using yield as well?
Can each function only return or output one object?
Edit: Tried to pass teamurls into the second function as shown below and I get the error - TypeError: team_urls() missing 1 required positional argument: 'teamurls'>
def table():
url = 'https://www.skysports.com/premier-league-table'
base_url = 'https://www.skysports.com'
today = str(date.today())
premier_r = requests.get(url)
print(premier_r.status_code)
premier_soup = BeautifulSoup(premier_r.text, 'html.parser')
headers = "Position, Team, Pl, W, D, L, F, A, GD, Pts\n"
premier_soup_tr = premier_soup.find_all('tr', {'class': 'standing-table__row'})
premier_soup_th = premier_soup.find_all('thead')
f = open('premier_league_table.csv', 'w')
f.write("Table as of {}\n".format (today))
f.write(headers)
premier_soup_tr = premier_soup.find_all('tr', {'class': 'standing-table__row'})
result = [[r.text.strip() for r in td.find_all('td', {'class': 'standing-table__cell'})][:-1] for td in premier_soup_tr[1:]]
teamurls = ([a.find("a",href=True)["href"] for a in premier_soup_tr[1:]])
return teamurls
for item in result:
f.write(",".join(item))
f.write("\n")
f.close()
print('\n Premier league teams full urls:\n')
for item in teamurls:
entire_team = []
# full_team_urls.append(base_url+ item)
full_team_urls = (base_url + item + '-squad')
yield full_team_urls
table()
def team_urls(teamurls):
teams = [i.strip('/') for i in teamurls]
print (teams)
team_urls()
To pass a value into a method, use arguments:
team_urls(teamurls)
You'll need to specify that argument at the definition of team_urls as such:
def team_urls(teamurls):
So I've created a class...
class Dept_member:
    """Budget figures for one department, with per-head cost printers."""

    # Class-level defaults. __init__ overrides the two budget fields and the
    # name on each instance; the head counts and gds_function keep these
    # defaults until a caller assigns them.
    quarterly_budget = 0
    outside_services = 0
    regular_count = 0
    contractor_count = 0
    gds_function = ''
    dept_name = ''

    def __init__(self, quarterly_budget, outside_services, dept_name):
        """Store the quarterly budget, outside-services budget and name."""
        self.quarterly_budget = quarterly_budget
        self.outside_services = outside_services
        self.dept_name = dept_name

    def regular_cost(self):
        """Print (quarterly_budget - outside_services) / regular_count.

        Raises:
            ZeroDivisionError: if ``regular_count`` is still 0.
        """
        # print(...) with a single argument behaves the same on Python 2
        # and Python 3.
        print("%s" % str((self.quarterly_budget - self.outside_services) / self.regular_count))

    def contractor_cost(self):
        """Print outside_services / contractor_count.

        Raises:
            ZeroDivisionError: if ``contractor_count`` is still 0.
        """
        print("%s" % str(self.outside_services / self.contractor_count))
Now I want to use variables I collect while iterating over an excel file to create objects for each row using the class detailed above.
for row in range(6,d_sh.get_highest_row()):
if f_sh.cell(row=row, column=2).value:
deptno = f_sh.cell(row=row, column=2).value
q_budget = f_sh.cell(row=row, column=17).value #Q3 Actual
os_budget = f_sh.cell(row=row, column=14).value
deptnode = f_sh.cell(row=row, column=1).value
chop = deptnode.split(" ")
deptname = " ".join(chop[1:])
Dept = "gds_"+str(deptno) ### This is what I want my new object to be called!
Dept = Dept_member(q_budget, os_budget, deptname)
Below are some output from an idle interactive session after this runs.
>>>
>>> deptno
u'180024446'
>>> q_budget
59412.00978792818
>>> os_budget
9973.898858075034
>>> deptnode
u'M180024446 GDS Common HW FEP China'
>>> deptname
u'GDS Common HW FEP China'
>>> Dept
<__main__.Dept_member instance at 0x126c32050>
>>> Dept.quarterly_budget
59412.00978792818
What I really wanted was an object named gds_180024446 but instead it mutated the variable.
Is it possible to create a bunch of objects using variables in a loop?
You should probably use a Python dictionary (see the tutorial page describing dictionaries) instead of creating a bunch of variables using the eval function:
Dept["gds_"+str(deptno)] = Dept_member(q_budget, os_budget, deptname)
After that, you can fetch your object from dictionary with:
Dept['gds_180024446']
I think you need to use the eval function like
# exec, not eval: eval() only evaluates expressions, so a string containing
# an assignment raises SyntaxError. Note that this executes text built from
# spreadsheet data; a dict keyed by "gds_<deptno>" avoids that entirely.
exec("gds_" + str(deptno) + " = Dept_member(q_budget, os_budget, deptname)")
I am trying to scrape a site that returns its data via Javascript. The code I wrote using BeautifulSoup works pretty well, but at random points during scraping I get the following error:
Traceback (most recent call last):
File "scraper.py", line 48, in <module>
accessible = accessible[0].contents[0]
IndexError: list index out of range
Sometimes I can scrape 4 urls, sometimes 15, but at some point the script eventually fails and gives me the above error. I can find no pattern behind the failing, so I'm really at a loss here - what am I doing wrong?
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time
countries = csv.reader(open("countries.csv", 'rb'), delimiter=",")
database = csv.writer(open("herdict_database.csv", 'w'), delimiter=',')
basepage = "https://www.herdict.org/explore/"
session_id = "indepth;jsessionid=C1D2073B637EBAE4DE36185564156382"
ccode = "#fc=IN"
end_date = "&fed=12/31/"
start_date = "&fsd=01/01/"
year_range = range(2009, 2011)
years = [str(year) for year in year_range]
def get_number(var):
    """Return the first one or two digit runs in ``var`` joined together.

    A string such as ``"1,234"`` contains two digit runs and yields
    ``"1234"``. Digit runs after the second one are ignored.

    Args:
        var: Text to search for digits.

    Returns:
        A string made of the first digit run, followed by the second one
        when present.

    Raises:
        ValueError: if ``var`` contains no digits at all.
    """
    # Raw string so that ``\d`` reaches the regex engine unchanged.
    number = re.findall(r"(\d+)", var)
    if not number:
        raise ValueError("no digits found in %r" % (var,))
    return "".join(number[:2])
def create_link(basepage, session_id, ccode, end_date, start_date, year):
    """Assemble the page URL for one country code and one year.

    All arguments are strings. They are concatenated in the order
    basepage, session_id, ccode, end_date, year, start_date, year, so the
    same year is appended to both the end-date and the start-date prefix.
    """
    return "".join([basepage, session_id, ccode, end_date, year, start_date, year])
for ccode, name in countries:
for year in years:
link = create_link(basepage, session_id, ccode, end_date, start_date, year)
print link
html = jw.get_page(link)
soup = BeautifulSoup(html, "lxml")
accessible = soup.find_all("em", class_="accessible")
inaccessible = soup.find_all("em", class_="inaccessible")
accessible = accessible[0].contents[0]
inaccessible = inaccessible[0].contents[0]
acc_num = get_number(accessible)
inacc_num = get_number(inaccessible)
print acc_num
print inacc_num
database.writerow([name]+[year]+[acc_num]+[inacc_num])
time.sleep(2)
You need to add error-handling to your code. When scraping a lot of websites, some will be malformed, or somehow broken. When that happens, you'll be trying to manipulate empty objects.
Look through the code, find all assumptions where you're assuming it works, and check against errors.
For that specific case, I would do this:
if not inaccessible or not accessible:
# malformed page
continue
soup.find_all("em", class_="accessible") is probably returning an empty list. You can try:
if accessible:
accessible = accessible[0].contents[0]
or more generally:
if accessibe and inaccesible:
accessible = accessible[0].contents[0]
inaccessible = inaccessible[0].contents[0]
else:
print 'Something went wrong!'
continue