Crawler: update data in an array and yield inside a loop - Python

I want to crawl continuously and update an array value in a loop, because I need to click a button to get the next value of the array.
However, yield inside the loop seems to behave like parallel threads, and the item is yielded many times.
What I want is to go through the loop, update the data, and yield the item only once.
Example:
Current output:
{'field1': 'data1',
 'field2': 'data2',
 'field3': ['data31']}
{'field1': 'data1',
 'field2': 'data2',
 'field3': ['data32']}
{'field1': 'data1',
 'field2': 'data2',
 'field3': ['data33']}
Expected:
{'field1': 'data1',
 'field2': 'data2',
 'field3': ['data31', 'data32', 'data33']}
Here is my code:
def parse_individual_listings(self, response):
    ...
    data = {}
    data['field1'] = 'data1'
    data['field2'] = 'data2'
    ...
    for i in range(3):
        yield scrapy.Request(
            urlparse.urljoin(response.url, link['href']),  # a different link each time
            callback=self.parse_individual_tabs,
            meta={'data': data, 'i': i},
        )

def parse_individual_tabs(self, response):
    data = response.meta['data']
    i = response.meta['i']
    ...
    # keep populating `data`
    data['field3'][i] = "data3[i]"  # this value changes when I click a button to update
    yield data

Try the inline_requests library (https://pypi.org/project/scrapy-inline-requests/). It lets you make requests inside the same function, which is useful for collecting data into one object instead of yielding several items. Check this example with some pseudocode:
import scrapy
from inline_requests import inline_requests
from scrapy import Selector

@inline_requests
def parse_individual_listings(self, response):
    ...
    data = {}
    data['field1'] = 'data1'
    data['field2'] = 'data2'
    data['field3'] = []
    ...
    for i in range(3):
        extra_req = yield scrapy.Request(
            response.urljoin(link['href']),  # a different link each time
        )
        # apply your logic here, e.g. extract some data
        sel = Selector(text=extra_req.text)
        data['field3'].append(sel.css('some css selector').get())
    yield data

Following the multi-threaded, unsynchronized idea, I ended up using a mutex lock in parse_individual_tabs and it seems to work fine.
from threading import Lock
...
    mutex = Lock()
    count = 0
...
    def parse_individual_tabs(self, response):
        self.mutex.acquire(1)
        try:
            self.count += 1
            data = response.meta['data']
            i = response.meta['i']
            ...
            # keep populating `data`
            data['field3'][i] = "data3[i]"  # this value changes when I click a button to update
        finally:
            self.mutex.release()
        if self.count == 3:  # check if this is the callback of the last yielded Request
            yield data
        else:
            return
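For completeness, the same single-item result can also be had without threads or extra libraries by chaining the requests: each callback yields the next Request, carries the partially filled dict along in meta, and only the last callback yields the item. Below is a minimal sketch of that pattern reusing the field names from the question; the link extraction and CSS selectors are placeholders, not taken from the original spider.
def parse_individual_listings(self, response):
    data = {'field1': 'data1', 'field2': 'data2', 'field3': []}
    # placeholder: however you collect the three links to visit
    links = [link.attrib['href'] for link in response.css('a.tab-link')]
    yield scrapy.Request(
        response.urljoin(links[0]),
        callback=self.parse_individual_tabs,
        meta={'data': data, 'links': links, 'i': 0},
    )

def parse_individual_tabs(self, response):
    data = response.meta['data']
    links = response.meta['links']
    i = response.meta['i']
    data['field3'].append(response.css('some css selector').get())  # placeholder selector
    if i + 1 < len(links):
        # not finished yet: chain the next request, carrying the same dict along
        yield scrapy.Request(
            response.urljoin(links[i + 1]),
            callback=self.parse_individual_tabs,
            meta={'data': data, 'links': links, 'i': i + 1},
        )
    else:
        # last tab reached: the item is yielded exactly once
        yield data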

Related

Unable to iterate through a csv.DictReader object

I'm having trouble iterating through a DictReader object. I'm completely stumped because the object gets returned from another function with the same properties (size and fieldnames), but I cannot figure out for the life of me how to iterate through it. I know there have been some changes to the object type, so I'll mention that this is on Python 3.7. Is there another way to access data in a DictReader object?
Anyway, here's the relevant code. I cleaned it up to post here, so I hope it's not too confusing:
def schedchange():
    data = otherfunction.data_get(referer_url=setup.url)
    logging.info(type(data))  # <class 'csv.DictReader'>
    import sys
    logging.info(sys.getsizeof(data))  # logs a value of 56
    logging.info(data.fieldnames)  # logs the fieldnames properly
    count = 0
    for row in data:
        count += 1
    logging.info('aspen count: ')
    logging.info(count)  # logs a value of: 37337
    return data

def main():
    rosters = client.schedchange()
    logger.info(rosters.fieldnames)  # prints/logs the fieldnames properly
    import sys
    logging.info(sys.getsizeof(rosters))  # logs 56, the same size as in schedchange
    logging.info(type(rosters))  # <class 'csv.DictReader'>
    count = 0
    for i in rosters:  # I've also tried enumerate(rosters) here
        count += 1
        logger.info(count)  # nothing gets logged
    logger.info('end count: ')
    logger.info(count)  # count never gets incremented, so a 0 is logged here

def data_get(self, referer_url):
    url = self.baseurl + 'quickReportMenu.do' + ';jessionid={}'.format(self.sessionid)
    r1 = get_retry(self.s, url, headers={'Referer': referer_url},
                   params={'format': 'simple',
                           'extension': '0',
                           'deploymentId': 'did'})
    url = self.baseurl + 'quickReportMenu.do'
    self.payload['org.apache.struts.taglib.html.TOKEN'] = findstruts(r1.text)
    self.payload['userEvent'] = '930'
    self.payload['deploymentId'] = 'did'
    self.payload['extension'] = '0'
    p1 = post_retry(self.s, url, headers={'Referer': r1.url}, data=self.payload)
    # Extract the url for the csv from the text of the response
    popupid = findurl(p1.text)
    # Get csv
    url = self.baseurl + popupid
    d1 = get_retry_simple(self.s, url, headers={'Referer': p1.url})
    # Break text into iterable lines at each \n
    d1.encoding = 'utf-8-sig'
    iterdat = str.splitlines(d1.text)
    # Parse csv
    data = csv.DictReader(iterdat)
    return data
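For context on the behaviour described above: csv.DictReader is a one-shot iterator, so the for loop in schedchange() consumes it and the later loop in main() finds nothing left to read. Here is a minimal standalone sketch of that effect, and of materializing the rows with list() so they can be iterated more than once; the sample data is made up for illustration.
import csv
import io

text = "name,score\nalice,1\nbob,2\n"

reader = csv.DictReader(io.StringIO(text))
print(sum(1 for _ in reader))  # 2 -- the first pass consumes the reader
print(sum(1 for _ in reader))  # 0 -- nothing is left for a second pass

rows = list(csv.DictReader(io.StringIO(text)))  # materialize the rows once
print(sum(1 for _ in rows), sum(1 for _ in rows))  # 2 2 -- a list can be re-iterated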

Celery how to create a task group in a for loop

I need to create a Celery group task and wait until it has finished, but the docs are not clear to me on how to achieve this.
This is my current state:
def import_media(request):
    keys = []
    for obj in s3_resource.Bucket(env.str('S3_BUCKET')).objects.all():
        if obj.key.endswith(('.m4v', '.mp4', '.m4a', '.mp3')):
            keys.append(obj.key)
    for key in keys:
        url = s3_client.generate_presigned_url(
            ClientMethod='get_object',
            Params={'Bucket': env.str('S3_BUCKET'), 'Key': key},
            ExpiresIn=86400,
        )
        if not Files.objects.filter(descriptor=strip_descriptor_url_scheme(url)).exists():
            extract_descriptor.apply_async(kwargs={"descriptor": str(url)})
    return None
Now I need to create a new task inside the group for every URL I have, how can I do that?
I now managed to get my flow working like this:
@require_http_methods(["GET"])
def import_media(request):
    keys = []
    urls = []
    for obj in s3_resource.Bucket(env.str('S3_BUCKET')).objects.all():
        if obj.key.endswith(('.m4v', '.mp4', '.m4a', '.mp3')):
            keys.append(obj.key)
    for key in keys:
        url = s3_client.generate_presigned_url(
            ClientMethod='get_object',
            Params={'Bucket': env.str('S3_BUCKET'), 'Key': key},
            ExpiresIn=86400,
        )
        if not Files.objects.filter(descriptor=strip_descriptor_url_scheme(url)).exists():
            new_file = Files.objects.create(descriptor=strip_descriptor_url_scheme(url))
            new_file.save()
            urls.append(url)
    workflow = (
        group([extract_descriptor.s(url) for url in urls]).delay()
    )
    workflow.get(timeout=None, interval=0.5)
    print("hello - Further processing here")
    return None
Any suggestions to optimize this? At least now it's working nicely!
Thanks in advance
https://docs.celeryproject.org/en/latest/userguide/canvas.html#groups
A group runs all tasks regardless of whether any fail; a chain runs the next task only if the previous one succeeds. Rather than calling apply_async each time in the for loop, you can use the signature method, which applies the args but doesn't execute the task until you're ready.
from celery import group
...
all_urls = []
for key in keys:
    url = s3_client.generate_presigned_url(
        ClientMethod='get_object',
        Params={'Bucket': env.str('S3_BUCKET'), 'Key': key},
        ExpiresIn=86400,
    )
    if not Files.objects.filter(descriptor=strip_descriptor_url_scheme(url)).exists():
        all_urls.append(url)

g = group(extract_descriptor.s(descriptor=str(url)) for url in all_urls)  # create the group
result = g()  # you may need to call g.apply_async(); this executes all tasks in the group
result.ready()  # have all subtasks completed?
result.successful()  # were all subtasks successful?
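Since the original goal was to wait until the whole group has finished, the blocking call the question's own code already uses (workflow.get) works on this result object too. A short sketch:
result = g.apply_async()  # dispatch every subtask in the group
values = result.get(timeout=None, interval=0.5)  # block until all subtasks have finished
# `values` holds one return value per subtask, in the order the signatures were added
Keep in mind that calling .get() inside the view blocks that request until every task is done, which is the trade-off the working version above already accepts.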

How to append dictionary to csv without appending keys

I have a dictionary features = {'feature1': 'hi', 'feature2': 'second feature', 'feature3': 'third feature'}. I need to save it to a csv file. But this dictionary gets renewed in each iteration and the new dictionary is appended to the existing csv file. I am using it in Scrapy.
class Myspider(SitemapSpider):
    name = 'spidername'
    sitemap_urls = ['https://www.arabam.com/sitemap/otomobil_1.xml']
    sitemap_rules = [
        ('/otomobil/', 'parse'),
        # ('/category/', 'parse_category'),
    ]

    def parse(self, response):
        yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        # print("hi here")
        features = {}
        features["ad_url"] = response.request.url
        # filling the feature dictionary
        df = pd.DataFrame.from_dict(features, orient='index')
        df = df.transpose()
        df.to_csv("result.csv", mode='a', index=False)
The problem is that this saves the dictionary's keys to the csv as well, every time a row is appended (screenshot of the resulting spreadsheet omitted).
Intuitively, the header should be written only once at the top, not again on every other row. How do I do that?
class Myspider(SitemapSpider):
    name = 'spidername'
    sitemap_urls = ['https://www.arabam.com/sitemap/otomobil_1.xml']
    sitemap_rules = [
        ('/otomobil/', 'parse'),
        # ('/category/', 'parse_category'),
    ]
    custom_settings = {'FEED_FORMAT': 'csv', 'FEED_URI': 'FILEname.csv'}

    def parse(self, response):
        yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        # print("hi here")
        item = {}
        item["ad_url"] = response.request.url
        yield item
To run it: scrapy crawl spidername
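If you would rather keep the pandas append from the question than switch to Scrapy's feed export, one minimal sketch (assuming the same features dict and result.csv path) is to write the header only when the file does not exist yet:
import os
import pandas as pd

def append_features(features, path="result.csv"):
    # One-row frame from the dict; emit the header only on the very first append.
    df = pd.DataFrame([features])
    df.to_csv(path, mode='a', index=False, header=not os.path.exists(path))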

Loop and add function component as index

I would like to change the index in the following code. Instead of having 'close' as the index, I want the corresponding x from the function. Sometimes, as in this example, even if I provide 4 currencies in curr, only 3 are available, so I cannot simply assign the list as the index after looping, because the size changes. Thank you for your help. I should add that even with set_index(x) the index remains 'close'.
The function daily_price_historical retrieves prices from a public API. There are exactly 7 columns, from which I select the first one (close).
The function:
def daily_price_historical(symbol, comparison_symbol, all_data=False, limit=1, aggregate=1, exchange=''):
    url = 'https://min-api.cryptocompare.com/data/histoday?fsym={}&tsym={}&limit={}&aggregate={}'\
        .format(symbol.upper(), comparison_symbol.upper(), limit, aggregate)
    if exchange:
        url += '&e={}'.format(exchange)
    if all_data:
        url += '&allData=true'
    page = requests.get(url)
    data = page.json()['Data']
    df = pd.DataFrame(data)
    df.drop(df.index[-1], inplace=True)
    return df
The code:
curr = ['1WO', 'ABX', 'ADH', 'ALX']
d_price = []
for x in curr:
    try:
        close = daily_price_historical(x, 'JPY', exchange='CCCAGG').close
        d_price.append(close).set_index(x)
    except:
        pass
d_price = pd.concat(d_price, axis=1)
d_price = d_price.transpose()
print(d_price)
The output:
            0
close  2.6100
close  0.3360
close  0.4843
The function daily_price_historical returns a DataFrame, so daily_price_historical(x, 'JPY', exchange='CCCAGG').close is a pandas Series. The label displayed for a Series is its name, and you can change it with rename. So you want:
...
close = daily_price_historical(x, 'JPY', exchange='CCCAGG').close
d_price.append(close.rename(x))
...
In your original code, d_price.append(close).set_index(x) raised an AttributeError: 'NoneType' object has no attribute 'set_index', because append on a list returns None. The exception was raised after the append had already happened and was silently swallowed by the catch-all except: pass.
What to remember from that: never use the very dangerous
try:
    ...
except:
    pass
which hides any error.
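A small illustration of the safer pattern, catching only the failures you actually expect and reporting which symbol was skipped (the exception types are plausible guesses for this API call, not taken from the answer above):
import requests

for x in curr:
    try:
        close = daily_price_historical(x, 'JPY', exchange='CCCAGG').close
    except (requests.RequestException, KeyError) as exc:
        # a network failure or a missing 'Data' key: skip this symbol, but say so
        print('skipping {}: {}'.format(x, exc))
        continue
    d_price.append(close.rename(x))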
Try this small piece of code:
import pandas as pd
import requests

curr = ['1WO', 'ABX', 'ADH', 'ALX']

def daily_price_historical(symbol, comparison_symbol, all_data=False, limit=1, aggregate=1, exchange=''):
    url = 'https://min-api.cryptocompare.com/data/histoday?fsym={}&tsym={}&limit={}&aggregate={}'\
        .format(symbol.upper(), comparison_symbol.upper(), limit, aggregate)
    if exchange:
        url += '&e={}'.format(exchange)
    if all_data:
        url += '&allData=true'
    page = requests.get(url)
    data = page.json()['Data']
    df = pd.DataFrame(data)
    df.drop(df.index[-1], inplace=True)
    return df

d_price = []
lables_ind = []
for idx, x in enumerate(curr):
    try:
        close = daily_price_historical(x, 'JPY', exchange='CCCAGG').close
        d_price.append(close[0])
        lables_ind.append(x)
    except:
        pass

d_price = pd.DataFrame(d_price, columns=["0"])
d_price.index = lables_ind
print(d_price)
Output
          0
1WO  2.6100
ADH  0.3360
ALX  0.4843

Steam API grabbing a list of prices

I was trying to grab a list of prices. So far my code for such a thing is:
def steamlibrarypull(steamID, key):
    # Pulls out a CSV of Steam appids.
    steaminfo = {
        'key': key,
        'steamid': steamID,
        'format': 'JSON',
        'include_appinfo': '1'
    }
    r = requests.get('http://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/', params=steaminfo)
    d = json.loads(r.content)
    I = d['response']['games']
    B = {}
    for games in I:
        B[games['name'].encode('utf8')] = games['appid']
    with open('games.csv', 'w') as f:
        for key, value in B.items():
            f.write("%s,%s\r\n" % (key, value))
    return B
But I'd like to be able to do a requests.get that will take this dictionary and output a list of prices. https://wiki.teamfortress.com/wiki/User:RJackson/StorefrontAPI seems to require a CSV list, but is that really necessary?
This is an informal Steam API, meaning Steam modifies it as they see fit. Currently it does not support multiple appids, as noted here.
To get the price of a game you would request:
http://store.steampowered.com/api/appdetails/?appids=237110&cc=us&filters=price_overview
Working from the code you have above, you will need to iterate through the dictionary and update the store price once you get it back.
def steamlibrarypull(steamID, key):
    # Pulls out a CSV of Steam appids.
    steaminfo = {
        'key': key,
        'steamid': steamID,
        'format': 'JSON',
        'include_appinfo': '1'
    }
    r = requests.get('http://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/', params=steaminfo)
    d = json.loads(r.content)
    response = d['response']['games']
    games = {}
    for game in response:
        getprice = requests.get('http://store.steampowered.com/api/appdetails/?appids=%d&filters=price_overview&cc=us' % game['appid'])
        if getprice.status_code == 200:
            rjson = json.loads(getprice.text)
            # use the appid to fetch the value and convert cents to a decimal price;
            # appid is numeric, so cast it to a string to look up the price
            try:
                price = rjson[str(game['appid'])]['data']['price_overview']['initial'] * .01
            except:
                price = 0  # no price data returned (e.g. free games)
            games[game['name']] = {'price': price, 'appid': game['appid']}
    return games
This will return a dictionary like the following:
{u'Half-Life 2: Episode Two': {'price': 7.99, 'appid': 420}}
It would be easier to navigate by appid instead of name, but as per your request and original structure this is how it should be done. This gives you the name, appid and price, which you can work with further or write to a file.
Note that this does not include a sleep timer. If your list of games is long, you should sleep your API calls for 2 seconds before making another one, or the API will block you and stop returning data, which will cause an error in Python when you parse the price. See the sketch below for one way to add that delay.
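A minimal sketch of that delay, slotted into the loop from the function above (the 2 seconds is the answer's own suggestion, not a documented rate limit):
import time

for game in response:
    getprice = requests.get(
        'http://store.steampowered.com/api/appdetails/?appids=%d&filters=price_overview&cc=us'
        % game['appid'])
    # ... parse the price exactly as above ...
    time.sleep(2)  # pause between store calls so the API does not start refusing requests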
