I'm trying to average the values returned in a for loop and print the result next to the group name. I can get it to calculate the average and print it below the loop's output, but not next to the first print line, as in Group 2...
When I add the argument as in Group 2, I get this error:
print "Group 2 - %s%%" % (avg)
NameError: name 'avg' is not defined
I'm not sure what I'm doing wrong, please help. If anyone has an easier way to do this, feel free to suggest it.
The end result I'm looking for should be as follows:
Group 1 - 100%
name1 100
name2 100
name3 100
Group 2 - 100%
name1 100
name2 100
name3 100
Here is my script so far:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import json
import pycurl
import time
from io import BytesIO
# Which monitors should data be retrieved for?
group1 = ['000000000000000000', '000000000000000000', '000000000000000000']
group2 = ['000000000000000000', '000000000000000000', '000000000000000000']

# Make calls to get the availability details
def connectMethod(method, url):
    c = pycurl.Curl()
    connectReturn = BytesIO()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.HTTPHEADER, ["Authorization: authtoken 00000000000000000000000000000000"])
    c.setopt(c.WRITEFUNCTION, connectReturn.write)
    c.setopt(pycurl.CUSTOMREQUEST, method)
    c.perform()
    c.close()
    connectOutput = connectReturn.getvalue()
    return connectOutput

returned_items = {}
avail = list()

print "Group 1"
for item in group1:
    base_url = 'https://www.domain.com/api/reports/summary/'
    putData = item + '?period=13&unit_of_time=1'
    req_url = base_url + putData
    listOfAvail = json.loads(connectMethod('GET', req_url))
    returned_items[item] = listOfAvail
    name = listOfAvail['data']['info']['resource_name']
    avail_pct = listOfAvail['data']['summary_details']['availability_percentage']
    avg_avail = avail.append(avail_pct)
    print "%s\t%s%%" % (name, avail_pct)
avg = float(sum(avail))/len(avail)
print avg

print "Group 2 - %s%%" % (avg)
for item in group2:
    base_url = 'https://www.domain.com/api/reports/summary/'
    putData = item + '?period=13&unit_of_time=1'
    req_url = base_url + putData
    listOfAvail = json.loads(connectMethod('GET', req_url))
    returned_items[item] = listOfAvail
    name = listOfAvail['data']['info']['resource_name']
    avail_pct = listOfAvail['data']['summary_details']['availability_percentage']
    avg_avail = avail.append(avail_pct)
    print "%s\t%s%%" % (name, avail_pct)
avg = float(sum(avail))/len(avail)
print avg
Quick, dirty, and ugly, but it should point you in the right direction to get it figured out and cleaned up a bit.
I just modified your code to output what you're asking for.
(I hard-coded the names and percentages where you're invoking the web call.)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import json
import pycurl
import time
from io import BytesIO
# Which monitors should data be retrieved for?
group1 = ['000000000000000000', '000000000000000000', '000000000000000000']
group2 = ['000000000000000000', '000000000000000000', '000000000000000000']
# Make calls to get the availability details
def connectMethod(method, url):
    c = pycurl.Curl()
    connectReturn = BytesIO()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.HTTPHEADER, ["Authorization: authtoken 00000000000000000000000000000000"])
    c.setopt(c.WRITEFUNCTION, connectReturn.write)
    c.setopt(pycurl.CUSTOMREQUEST, method)
    c.perform()
    c.close()
    connectOutput = connectReturn.getvalue()
    return connectOutput

def pprint(title, names, percentages):
    avg = float(sum(percentages))/len(percentages)
    print "%s - %s%%" % (title, avg)
    for i in range(len(names)):
        print "%s\t%s" % (names[i], percentages[i])

returned_items = {}
percentages = []
names = []

for item in group1:
    base_url = 'https://www.domain.com/api/reports/summary/'
    putData = item + '?period=13&unit_of_time=1'
    req_url = base_url + putData
    #listOfAvail = json.loads(connectMethod('GET', req_url))
    returned_items[item] = {"name": "item 1"} #listOfAvail
    name = "blah" # listOfAvail['data']['info']['resource_name']
    avail_pct = 60 # listOfAvail['data']['summary_details']['availability_percentage']
    names.append(name)
    percentages.append(avail_pct)
pprint("Group 1", names, percentages)
print("")

del percentages[:]
del names[:]

for item in group2:
    base_url = 'https://www.domain.com/api/reports/summary/'
    putData = item + '?period=13&unit_of_time=1'
    req_url = base_url + putData
    #listOfAvail = json.loads(connectMethod('GET', req_url))
    returned_items[item] = {"name": "item 2"} #listOfAvail
    name = "asdf" #listOfAvail['data']['info']['resource_name']
    avail_pct = 87 #listOfAvail['data']['summary_details']['availability_percentage']
    names.append(name)
    percentages.append(avail_pct)
pprint("Group 2", names, percentages)
This outputs:
Group 1 - 60.0%
blah 60
blah 60
blah 60
Group 2 - 87.0%
asdf 87
asdf 87
asdf 87
Essentially you just need to perform all your calculations first, and then print the results.
Some thoughts to help you clean this up a bit:
Code reuse:
The code in the two for loops is nearly identical. Refactor it into a function where you pass in the group, URL, and so on, as in the sketch below (does this make sense?).
Have a look at the requests library; it could condense your web-call code quite a bit.
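For instance, here is a rough, untested sketch of that refactor using requests (names like report_group and BASE_URL are illustrative, and it assumes the same JSON shape your original code reads):
import requests

BASE_URL = 'https://www.domain.com/api/reports/summary/'
HEADERS = {'Authorization': 'authtoken 00000000000000000000000000000000'}

def report_group(title, monitor_ids):
    # Collect everything first, then print: header with the average, then the rows
    names, percentages = [], []
    for monitor_id in monitor_ids:
        resp = requests.get(BASE_URL + monitor_id,
                            params={'period': 13, 'unit_of_time': 1},
                            headers=HEADERS)
        data = resp.json()
        names.append(data['data']['info']['resource_name'])
        percentages.append(data['data']['summary_details']['availability_percentage'])
    avg = float(sum(percentages)) / len(percentages)
    print "%s - %s%%" % (title, avg)
    for name, pct in zip(names, percentages):
        print "%s\t%s%%" % (name, pct)

report_group("Group 1", group1)
print ""
report_group("Group 2", group2)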
I am new to Stack Overflow and this is my first post, so I hope I can explain myself well and you can help me! Thanks in advance!
I am using Scrapy to scrape a popular real estate website from my native country. I am doing well with all the characteristics I want, such as price, surface, bedrooms, among others, but I haven't been able to get the latitude/longitude of a property. On the website, for example https://www.portalinmobiliario.com/MLC-564988630-estilo-mariposa-_JM#position=2&type=item&tracking_id=ed337e69-9999-4ede-b393-ef378e1a5675, you can find a Google Maps location as the image shows, and inside this HTML element it is possible to get the lat/long (highlighted in blue). But when I try to reach this element in my code, the spider doesn't recognize it.
Using the css selector crs_location = response.css('div.map-container img:nth-child(1)').getall() I am able to get the first img inside the div, which returns https://http2.mlstatic.com/resources/frontend/web-vip/ui-dist/images/pin-real-estate-d1ebb73e65.svg. But when I change the nth-child to crs_location = response.css('div.map-container img:nth-child(2)').getall() to get the second child (the one I want), the crs_location variable comes back empty.
I'd appreciate any help figuring out how to get the lat/long.
Thanks!
[Screenshot: HTML elements]
Complete Code:
import scrapy
from scrapy import Selector
import requests
import pandas as pd
import numpy as np
# Import the CrawlerProcess
from scrapy.crawler import CrawlerProcess

# Create the Spider class
class Spider_Inmob(scrapy.Spider):
    name = 'spider_inmob'
    #download_delay = 3

    # start_requests method
    def start_requests(self):
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
        i = 1
        page = 0
        for y in range(1):
            url = 'http://portalinmobiliario.com/venta/departamento/propiedades-usadas/providencia-metropolitana/_Desde_' + str(page)
            print("----------PRUEBA1--------------" + str(page))
            page = 51 + 50*i
            i += 1
            yield scrapy.Request(url=url, callback=self.parse, headers=headers)

    def parse(self, response):
        global aux3
        links_busqueda = response.css('ol.ui-search-layout > li.ui-search-layout__item a.ui-search-result__content.ui-search-link::attr(href)').getall()
        print(len(links_busqueda))
        for url in links_busqueda:
            aux3 = aux3 + 1
            print(aux3)
            yield response.follow(url=url, callback=self.parse_propiedad, meta={'dont_redirect': True, 'handle_httpstatus_list': [302]})

    def parse_propiedad(self, response):
        global aux2
        aux2 = aux2 + 1
        global crs_Bedroom, crs_Currency, crs_Link, crs_Parking, crs_Price, crs_Restroom, crs_Storage, crs_Total_Surface, crs_Useful_Surface, crs_location
        #print("Number iteration " + str(aux2))
        global Nombre_variables
        #print('-------------------------PRUEBAAAA------1---------------')
        aux = 1
        crs_prueba = response.css('header.item-title > h1.item-title__primary::text').getall()
        #print(crs_prueba)

        # This loop goes over each property characteristic, like total surface, bedrooms, bathrooms, etc.
        for i in range(20):
            variable = response.css('section.specs-container > ul.specs-list li.specs-item:nth-child(' + str(i) + ') > strong::text').getall()
            variable2 = response.css('section.specs-container > ul.specs-list li.specs-item:nth-child(' + str(i) + ') > span::text').getall()
            np_variable = np.array(variable)
            if not variable:
                a = 0
            else:
                for var in Nombre_variables:
                    if np_variable[0] == "Superficie total":
                        crs_Total_Surface = variable2
                    elif np_variable[0] == "Superficie útil":
                        crs_Useful_Surface = variable2
                    elif np_variable[0] == "Dormitorios":
                        crs_Bedroom = variable2
                    elif np_variable[0] == "Baños":
                        crs_Restroom = variable2
                    elif np_variable[0] == "Estacionamientos":
                        crs_Parking = variable2
                    elif np_variable[0] == "Bodegas":
                        crs_Storage = variable2
                        # print(crs_Storage)

        #print("----------------PRUEBA--------------2--------------------")
        crs_Link = response.url
        crs_location = response.css('div.map-container img:nth-child(2)').getall()
        print("\n\n\n")
        print(crs_location)
        print("\n\n\n")

        # As we have two kinds of currency, we convert everything to UF
        variable3 = response.css('fieldset.item-price span.price-tag > span.price-tag-symbol::text').getall()
        np_variable3 = np.array(variable3)
        # print(np_variable3[0])
        if np_variable3[0] != "UF":
            crs_Currency = "$"
            variable4 = response.css('fieldset.item-price span.price-tag > span.price-tag-fraction::text').getall()
            variable4 = str(variable4).strip("['']")
            # print(variable4)
            variable4 = str(variable4).replace(".", "")
            # print(variable4)
            # print(type(variable4))
            np_variable4 = np.array(variable4)
            variable4 = float(variable4)
            # print(variable4)
            crs_Price = round(variable4/28500, 0)
        else:
            crs_Currency = response.css('fieldset.item-price span.price-tag > span.price-tag-symbol::text').getall()
            crs_Price = response.css('fieldset.item-price span.price-tag > span.price-tag-fraction::text').getall()

        df2 = {'Link': [crs_Link],
               'Currency': [crs_Currency],
               'Price': [crs_Price],
               'Total Surface': [crs_Total_Surface],
               'Useful Surface': [crs_Useful_Surface],
               'Location': [crs_location],
               'Bedroom': [crs_Bedroom],
               'Restroom': [crs_Restroom],
               'Parking': [crs_Parking],
               'Storage': [crs_Storage]}
        # print(df2)
        # print('-------------------------PRUEBAAAA---------------')
        global df3
        df3 = df3.append(df2, ignore_index=True)
        #print(df3.head())

# Names of the variables to take into consideration
Nombre_variables = ["Superficie total", "Superficie útil", "Dormitorios", "Baños", "Estacionamientos", "Bodegas"]
Dict_Nombre_variables = {}

# Initialize the DataFrames
headers = ["Link", "Currency", "Price", "Total Surface", "Useful Surface", "Location", "Bedroom", "Restroom", "Parking", "Storage"]
df_data = pd.DataFrame(columns=headers)
df3 = pd.DataFrame(columns=headers)

# Initialize global variables used in methods
aux2 = 0
crs_Link = 0
crs_Currency = 0
crs_Price = 0
crs_Total_Surface = 0
crs_Useful_Surface = 0
crs_location = 0
crs_Bedroom = 0
crs_Restroom = 0
crs_Parking = 0
crs_Storage = 0
aux3 = 0

# Run the Spider
process = CrawlerProcess({'USER_AGENT': 'hol'})
process.crawl(Spider_Inmob)
process.start()

path = "D:\\0. Documentos\\7. DataCamp\\1. WebScraping\\99. Ejemplos\\PortalInmob.csv"
df3.to_csv(path)
print(df3.head())
print(df3)
print(df3['Location'])
This is pretty trivial with requests and regex, since we know it's the only lat/lon on the page and we know the URL format: we can capture the lat/lon portion of the static-map URL with a regex and split it apart.
import requests
import re

url = 'https://www.portalinmobiliario.com/MLC-564988630-estilo-mariposa-_JM#position=2&type=item&tracking_id=ed337e69-9999-4ede-b393-ef378e1a5675'
r = requests.get(url).text
lat, lon = re.findall(r'center=(-?\d+\.\d+%2C-?\d+\.\d+)', r)[0].split('%2C')
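If you want to guard against the pattern being absent (layout changes, JS-only rendering), here is a small hedged variant of the same idea:
import re
import requests

def extract_latlon(page_url):
    # Returns (lat, lon) as floats, or None if no 'center=' static-map parameter is found
    text = requests.get(page_url).text
    match = re.search(r'center=(-?\d+\.\d+)%2C(-?\d+\.\d+)', text)
    if match is None:
        return None
    return float(match.group(1)), float(match.group(2))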
I want to capture the sensor data through ThingSpeak.
I used the URL provided, with the API key, in the browser:
http://api.thingspeak.com/update?key=MYKEY&field1=25&field2=75
I expected it to return field1 and field2, but the result below shows only the value of field1.
"channel":{
"id":202242,
"name":"DHT11",
"latitude":"0.0",
"longitude":"0.0",
"field1":"Temperature ( degC ) 1",
"field2":"Humidity ( % )",
"created_at":"2016-12-11T17:16:21Z",
"updated_at":"2016-12-11T18:12:00Z",
"last_entry_id":12
},
"feeds":[
{
"created_at":"2016-12-11T18:12:00Z",
"entry_id":12,
"field1":25
}
]
What step have I missed?
Try this approach: here you make the request using the ThingSpeak API; you will find the various API requests in the ThingSpeak documentation.
import urllib2
import json
import time

READ_API_KEY = ' '
CHANNEL_ID = ' '

while True:
    TS = urllib2.urlopen("http://api.thingspeak.com/channels/%s/feeds/last.json?api_key=%s" \
                         % (CHANNEL_ID, READ_API_KEY))
    response = TS.read()
    data = json.loads(response)
    a = data['created_at']
    b = data['field1']
    c = data['field2']
    d = data['field3']
    print a + " " + b + " " + c + " " + d
    time.sleep(5)
    TS.close()
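If you are on Python 3, where urllib2 no longer exists, a roughly equivalent sketch with the standard library would be:
import json
import time
import urllib.request

READ_API_KEY = ' '
CHANNEL_ID = ' '

while True:
    # Read the most recent entry from the channel feed
    with urllib.request.urlopen(
            "http://api.thingspeak.com/channels/%s/feeds/last.json?api_key=%s"
            % (CHANNEL_ID, READ_API_KEY)) as ts:
        data = json.loads(ts.read().decode())
    # Fields that were never written simply won't be present
    print(data['created_at'], data.get('field1'), data.get('field2'))
    time.sleep(5)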
With this code I've updated a bunch of rows in a Google Spreadsheet.
The request goes well and returns the updatedRange shown below.
result = service.spreadsheets().values().append(
    spreadsheetId=spreadsheetId,
    range=rangeName,
    valueInputOption="RAW",
    insertDataOption="INSERT_ROWS",
    body=body
).execute()
print(result)
print("Range updated")
updateRange = result['updates']['updatedRange']
Now I would like to make a batchUpdate request to set the formatting or set a protected range, but those APIs require a range specified as startRowIndex, endRowIndex, and so on.
How can I retrieve the row indexes from the updatedRange?
While waiting for a native or better answer, I'll post a function I've created to translate a namedRange into a gridRange.
The function is far from perfect and does not translate the sheet name to a sheet id (I leave that task to another, more specific function), but it accepts named ranges in the forms:
sheet!A:B
sheet!A1:B
sheet!A:B5
sheet!A1:B5
Here is the code:
import re

def namedRange2Grid(rangeName):
    ascii_uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    match = re.match(r".*?!([A-Z0-9]+):([A-Z0-9]+)", rangeName)
    if match:
        start = match.group(1)
        end = match.group(2)
        matchStart = re.match(r"([A-Z]+)(\d+)?", start)
        matchEnd = re.match(r"([A-Z]+)(\d+)?", end)
        if matchStart and matchEnd:
            GridRange = {}
            letterStart = matchStart.group(1)
            letterEnd = matchEnd.group(1)
            if matchStart.group(2):
                numberStart = int(matchStart.group(2))
                GridRange['startRowIndex'] = numberStart - 1
            if matchEnd.group(2):
                numberEnd = int(matchEnd.group(2))
                GridRange['endRowIndex'] = numberEnd
            # Convert column letters to 0-based indexes (A=0, Z=25, AA=26, ...)
            i = 0
            for ch in letterStart:
                i = i * 26 + ascii_uppercase.index(ch) + 1
            GridRange['startColumnIndex'] = i - 1
            i = 0
            for ch in letterEnd:
                i = i * 26 + ascii_uppercase.index(ch) + 1
            GridRange['endColumnIndex'] = i  # end indexes are exclusive
            return GridRange
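For example, a quick check of the expected output (row indexes are 0-based and end indexes exclusive, matching the Sheets API gridRange convention):
print(namedRange2Grid("sheet!A1:B5"))
# {'startRowIndex': 0, 'endRowIndex': 5, 'startColumnIndex': 0, 'endColumnIndex': 2}
print(namedRange2Grid("sheet!A:B"))
# {'startColumnIndex': 0, 'endColumnIndex': 2}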
I have a CSV file with numbers, and I want to insert these numbers into a specific location in a URL: just after "value":.
Here is my code:
with open('update_cases_id.csv') as p:
    for lines in p:
        uuid = lines.rstrip()
        url_POST = "www.example.com/"
        values = {}
        values['return_type'] = 'retrieval'
        values['format'] = 'TSV'
        values['size'] = '70'
        values['filters'] = '{"op":"and","content":[{"op":"in","content":{"field":"cases.case_id","value": .format(uuid)}}]}'
        data = urllib.urlencode(values)
        url_final = url_POST + '?' + data
        req2 = urllib2.Request(url_final)
        req2.add_header('cookie', cookie)
        handle = urllib2.urlopen(req2)
(Edited:
example input: 123456-123456-987654
example output: it's data text)
You can do this with string formatting; this should work for you:
# ...snip
values['filters'] = '{"op":"and","content":[{"op":"in","content":{"field":"cases.case_id","value":%s}}]}' % uuid
# snip...
The %s will be replaced by the uuid by the % string-formatting operator:
>>> values = {}
>>> uuid = 1234
>>> values['filters'] = '{"op":"and","content":[{"op":"in","content":{"field":"cases.case_id","value":%s}}]}' % uuid
>>> values
{'filters': '{"op":"and","content":[{"op":"in","content":{"field":"cases.case_id","value":1234}}]}'}
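A hedged alternative worth mentioning: build the filter as a Python dict and serialize it with json.dumps, which avoids brace-counting and quoting mistakes entirely (this sketch assumes the endpoint accepts the case id as a plain string value):
import json

uuid = '123456-123456-987654'
filters = {
    "op": "and",
    "content": [
        {"op": "in",
         "content": {"field": "cases.case_id", "value": uuid}}
    ],
}
values['filters'] = json.dumps(filters)  # values built as in the question's snippet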
Try using Template:
from string import Template

params = Template('{"op":"and","content":[{"op":"in","content":{"field":"cases.case_id","value": ${your_value}}}]}')
params = params.safe_substitute(your_value=123)
# params is '{"op":"and","content":[{"op":"in","content":{"field":"cases.case_id","value": 123}}]}'
I am trying to parse data from a website by inserting it into a list, but the list comes back empty.
import re
import urllib3
from bs4 import BeautifulSoup

list0_v2 = []
url = "http://www.releasechimps.org/resources/publication/whos-there-md-anderson"
http = urllib3.PoolManager()
r = http.request('GET', url)
soup = BeautifulSoup(r.data, "html.parser")
#print(r.data)
loop = re.findall(r'<td>(.*?)</td>', str(r.data))
#print(str(loop))
newLoop = str(loop)
#print(newLoop)
for x in range(1229):
    if "\\n\\t\\t\\t\\t" in loop[x]:
        loop[x] = loop[x].replace("\\n\\t\\t\\t\\t", "")
        list0_v2.append(str(loop[x]))
        print(loop[x])
print(str(list0_v2))
Edit: I didn't really have anything else going on, so I turned your data format into a nice list of dictionaries. There's a weird <td height="26"> on monkey 111, so I had to change the regex slightly.
Hope this helps you. I did it because I care about the monkeys, man.
import html
import re
import urllib.request

list0_v2 = []
final_list = []

url = "http://www.releasechimps.org/resources/publication/whos-there-md-anderson"
data = urllib.request.urlopen(url).read()
loop = re.findall(r'<td.*?>(.*?)</td>', str(data))

for item in loop:
    if "\\n\\t\\t\\t\\t" in item or "em>" in item:
        item = item.replace("\\n\\t\\t\\t\\t", "").replace("<em>", "")\
               .replace("</em>", "")
    if " " == item:
        continue
    list0_v2.append(item)

n = 1
while len(list0_v2) != 0:
    form = {"n": 0, "name": "", "id": "", "gender": "", "birthdate": "", "notes": ""}
    try:
        # Rows with a notes column end their sixth cell with a period
        if list0_v2[5][-1] == '.':
            numb, name, ids, gender, birthdate, notes = list0_v2[0:6]
            form["notes"] = notes
            del(list0_v2[0:6])
        else:
            raise Exception('foo')
    except:
        numb, name, ids, gender, birthdate = list0_v2[0:5]
        del(list0_v2[0:5])
    form["n"] = int(numb)
    form["name"] = html.unescape(name)
    form["id"] = ids
    form["gender"] = gender
    form["birthdate"] = birthdate
    final_list.append(form)
    n += 1

for li in final_list:
    print("{:3} {:10} {:10} {:3} {:10} {}".format(li["n"], li["name"], li["id"],\
          li["gender"], li["birthdate"], li["notes"]))