Xpath field repetition - python

I want to put the scraped fields into data and then insert the rows into MySQL, but the resulting content field is repeated for every row. I think the problem is my for-loop logic, so how should I change it?
def get_user_data(self, start_url):
    html = requests.get(url=start_url, headers=self.headers, cookies=self.cookies).content
    selector = etree.fromstring(html, etree.HTMLParser(encoding='utf-8'))
    contents = selector.xpath('//span[@class="ctt"]/text()')
    times = selector.xpath('//span[@class="ct"]/text()')
    data = {}
    for each_text in contents:
        data['content'] = each_text
        for each_time in times:
            month_day, time, device = each_time.split(maxsplit=2)
            data['mobile_phone'] = device
            data['create_time'] = month_day + '\n' + time
            data['crawl_time'] = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
            self.mysql.insert(data)
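A minimal sketch of one possible fix, assuming the contents and times lists line up one-to-one: zip the two lists and build a fresh dict for each pair, so every insert gets its own content instead of reusing the last one written into the shared data dict.

# Sketch only: assumes contents[i] corresponds to times[i].
for each_text, each_time in zip(contents, times):
    month_day, time, device = each_time.split(maxsplit=2)
    data = {
        'content': each_text,
        'mobile_phone': device,
        'create_time': month_day + '\n' + time,
        'crawl_time': datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S'),
    }
    self.mysql.insert(data)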

Related

Unable to iterate through a csv.DictReader object

I'm having trouble iterating through a DictReader object. I'm completely stumped because the object gets returned from another function with the same properties (size and fieldnames), but I cannot figure out for the life of me how to iterate through it. I know there have been some changes to the object type, so I'll mention that this is on Python 3.7. Is there another way to access data in a DictReader object?
Anywho, here's relevant code. I cleaned it up to post here, so hope it's not too confusing:
def schedchange():
    data = otherfunction.data_get(referer_url=setup.url)
    logging.info(type(data))  # <class 'csv.DictReader'>
    import sys
    logging.info(sys.getsizeof(data))  # logs a value of 56
    logging.info(data.fieldnames)  # logs the fieldnames properly
    count = 0
    for row in data:
        count += 1
    logging.info('aspen count: ')
    logging.info(count)  # logs a value of: 37337
    return data

def main():
    rosters = client.schedchange()
    logger.info(rosters.fieldnames)  # Prints/logs fieldnames properly
    import sys
    logging.info(sys.getsizeof(rosters))  # Logs 56, the same size as in schedchange
    logging.info(type(rosters))  # <class 'csv.DictReader'>
    count = 0
    for i in rosters:  # I've also tried enumerate(rosters) here
        count += 1
        logger.info(count)  # nothing gets logged
    logger.info('end count: ')
    logger.info(count)  # Count never gets incremented, so a 0 is logged here
def data_get(self, referer_url):
    url = self.baseurl + 'quickReportMenu.do' + ';jessionid={}'.format(self.sessionid)
    r1 = get_retry(self.s, url, headers={'Referer': referer_url},
                   params={'format': 'simple',
                           'extension': '0',
                           'deploymentId': 'did'})
    url = self.baseurl + 'quickReportMenu.do'
    self.payload['org.apache.struts.taglib.html.TOKEN'] = findstruts(r1.text)
    self.payload['userEvent'] = '930'
    self.payload['deploymentId'] = 'did'
    self.payload['extension'] = '0'
    p1 = post_retry(self.s, url, headers={'Referer': r1.url}, data=self.payload)
    # Extract the url for the csv from the text of the response
    popupid = findurl(p1.text)
    # Get csv
    url = self.baseurl + popupid
    d1 = get_retry_simple(self.s, url, headers={'Referer': p1.url})
    # Break text into iterable lines at each \n
    d1.encoding = 'utf-8-sig'
    iterdat = str.splitlines(d1.text)
    # Parse csv
    data = csv.DictReader(iterdat)
    return data
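The symptom (fieldnames are readable but the loop in main() yields nothing) is consistent with a DictReader being a one-shot iterator: the counting loop in schedchange() already consumes every row before the object is returned. A minimal sketch of one way around that, assuming the caller only needs the rows, is to materialize the reader into a list before returning it:

def schedchange():
    data = otherfunction.data_get(referer_url=setup.url)
    rows = list(data)           # consume the iterator exactly once
    logging.info(data.fieldnames)
    logging.info(len(rows))     # row count, instead of counting in a loop
    return rows                 # a list can be iterated again in main()

Note that main() would then iterate a plain list and could no longer call .fieldnames on the return value, so the fieldnames would need to be logged inside schedchange (as in the sketch) or returned alongside the rows.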

divide list of elements in scrapy output into separate rows

I am trying to split the Scrapy output into separate rows in an Excel file, but I get something like this
In other words, each output for variant id, price and name should be placed on a separate row in Excel.
I am using the scrapy-xlsx 0.1.1 library to export the output to an xlsx file (it cannot be a csv).
Please tell me where the issue is.
import scrapy
from ..items import ZooplusItem
import re

class ZooplusDeSpider(scrapy.Spider):
    name = 'zooplus_de'
    allowed_domains = ['zooplus.de']
    start_urls = ['https://www.zooplus.de/shop/hunde/hundefutter_trockenfutter/diaetfutter']

    def parse(self, response):
        for link in response.css('.MuiGrid-root.MuiGrid-container.MuiGrid-spacing-xs-2.MuiGrid-justify-xs-flex-end'):
            items = ZooplusItem()
            redirect_urls = response.request.meta.get('redirect_urls')
            items['url'] = link.redirect_urls[0] if redirect_urls else response.request.url
            items['product_url'] = link.css('.MuiGrid-root.product-image a::attr(href)').getall()
            items['title'] = link.css('h3 a::text').getall()
            items['id'] = link.css('h3 a::attr(id)').getall()
            items['review'] = link.css('span.sc-fzoaKM.kVcaXm::text').getall()
            items['review'] = re.sub(r'\D', " ", str(items['review']))
            items['review'] = items['review'].replace(" ", "")
            #items['review'] = int(items['review'])
            items['rate'] = len(link.css('a.v3-link i[role=full-star]'))
            items['variant_id'] = [i.strip().split('/n') for i in link.css('.jss114.jss115::text').extract()]
            items['variant_name'] = [i.strip().split('/n') for i in link.css('.sc-fzqARJ.cHdpSy:not(.jss114.jss115)::text').extract()]
            items['variant_price'] = [i.strip().split('/n') for i in link.css('div.product__prices_col meta::attr(content)').extract()]
            yield items
If you want to store all of the variants with the common information duplicated, then you need to loop through each variant and yield it separately. You can copy the common information you've already collected and add the variant fields to that.
In summary, replace
items['variant_id'] = [i.strip().split('/n') for i in link.css('.jss114.jss115::text').extract()]
items['variant_name'] = [i.strip().split('/n') for i in link.css('.sc-fzqARJ.cHdpSy:not(.jss114.jss115)::text').extract()]
items['variant_price'] = [i.strip().split('/n') for i in link.css('div.product__prices_col meta::attr(content)').extract()]
yield items
with something like
for i in link.css("[data-zta='product-variant']"):
    variant = items.copy()
    variant["variant_id"] = i.attrib["data-variant-id"]
    variant["variant_name"] = "".join(i.css(".title > div::text").getall()).strip()
    variant['variant_price'] = i.css("[itemprop='price']::attr(content)").get()
    yield variant

Data output is not the same as inside function

I am currently having an issue while trying to store data in a list (using dataclasses). When I print the data inside the list within the function (PullIncursionData()), I get a set of differing numbers (never the same, which is expected given their nature). But when I print the list after the function has been called and its return value stored in a variable, it somehow prints only the same number for every entry.
I cannot share the numbers, as they update with EVE Online's API, so the only way is to run it locally and read the first list yourself.
The repository is here: https://github.com/AtherActive/EVEAPI-Demo
Heads up! Inside main.py (the file with issues; a snippet of the code is down below) there are more functions. All functions from line 90 onward are important; the rest can be ignored for this question, as they do not interact with the other functions.
def PullIncursionData():
    # Pulls data from URL and converts it into JSON
    url = 'https://esi.evetech.net/latest/incursions/?datasource=tranquility'
    data = rq.get(url)
    jsData = data.json()
    # Init var to store incursions
    incursions = []
    # Set lenght for loop. yay
    length = len(jsData)
    # Every loop incursion data will be read by __parseIncursionData(). It then gets added to var Incursions.
    for i in range(length):
        # Add data to var Incursion.
        incursions.append(__parseIncursionData(jsData, i))
        # If Dev mode, print some debug. Can be toggled in settings.py
        if settings.developerMode == 1:
            print(incursions[i].constellation_id)
    return incursions

# Basically parses the input data in a decent manner. No comments needed really.
def __parseIncursionData(jsData, i):
    icstruct = stru.Incursion
    icstruct.constellation_id = jsData[i]['constellation_id']
    icstruct.constellation_name = 'none'
    icstruct.staging = jsData[i]['staging_solar_system_id']
    icstruct.region_name = ResolveSystemNames(icstruct.constellation_id, 'con-reg')
    icstruct.status = jsData[i]['state']
    icstruct.systems_id = jsData[i]['infested_solar_systems']
    icstruct.systems_names = ResolveSystemNames(jsData[i]['infested_solar_systems'], 'system')
    return icstruct

# Resolves names for systems, regions and constellations. Still WIP.
def ResolveSystemNames(id, mode='constellation'):
    # init value
    output_name = 'none'
    # If constellation, pull data and find region name.
    if mode == 'con-reg':
        url = 'https://www.fuzzwork.co.uk/api/mapdata.php?constellationid={}&format=json'.format(id)
        data = rq.get(url)
        jsData = data.json()
        output_name = jsData[0]['regionname']
    # Pulls system name form Fuzzwork.co.uk.
    elif mode == 'system':
        # Convert output to a list.
        output_name = []
        lenght = len(id)
        # Pulls system name from Fuzzwork. Not that hard.
        for i in range(lenght):
            url = 'https://www.fuzzwork.co.uk/api/mapdata.php?solarsystemid={}&format=json'.format(id[i])
            data = rq.get(url)
            jsData = data.json()
            output_name.append(jsData[i]['solarsystemname'])
    return output_name

icdata = PullIncursionData()
print('external data check:')
length = len(icdata)
for i in range(length):
    print(icdata[i].constellation_id)
structures.py (custom file)
@dataclass
class Incursion:
    constellation_id = int
    constellation_name = str
    staging = int
    staging_name = str
    systems_id = list
    systems_names = list
    region_name = str
    status = str

    def ___init___(self):
        self.constellation_id = -1
        self.constellation_name = 'undefined'
        self.staging = -1
        self.staging_name = 'undefined'
        self.systems_id = []
        self.systems_names = []
        self.region_name = 'undefined'
        self.status = 'unknown'
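The collapse to a single repeated value is consistent with stru.Incursion being used as a class rather than an instance: icstruct = stru.Incursion rebinds the class itself, so every append stores the same class object whose class attributes keep getting overwritten. A minimal sketch of the usual dataclass pattern, with annotated fields and one new instance per incursion (field names taken from the question; this is an illustration, not the repository's code):

from dataclasses import dataclass, field

@dataclass
class Incursion:
    constellation_id: int = -1
    constellation_name: str = 'undefined'
    staging: int = -1
    staging_name: str = 'undefined'
    systems_id: list = field(default_factory=list)
    systems_names: list = field(default_factory=list)
    region_name: str = 'undefined'
    status: str = 'unknown'

def __parseIncursionData(jsData, i):
    # Create a new instance per incursion instead of mutating the class itself.
    ic = Incursion()
    ic.constellation_id = jsData[i]['constellation_id']
    ic.staging = jsData[i]['staging_solar_system_id']
    ic.status = jsData[i]['state']
    ic.systems_id = jsData[i]['infested_solar_systems']
    return ic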

Python - Using a While Loop, how can I update a variable string to be used in a function

Background: I'm attempting to create a dataframe using data pulled from Twitch's API. They only allow 100 records per call, so with each pull a new pagination cursor is offered in order to move on to the next page. I'm using the following code to try to pull this data efficiently rather than manually adjusting the after=(pagination value) in the GET request. Right now the variable I'm trying to make dynamic is the 'Pagination' variable, but it only gets updated once the loop finishes - not helpful! Take a look below and see if you notice anything I can change to achieve this goal. Any help is appreciated!
TwitchTopGamesDataFrame = []  # This is our Data List
BaseURL = 'https://api.twitch.tv/helix/games/top?first=100'
Headers = {'client-id':'lqctse0orgdbs5gdf5faz665api03r','Authorization': 'Bearer a1yl09mwmnwetp6ovocilheias8pzt'}
Indent = 2
Pagination = ''
FullURL = BaseURL + Pagination
Response = requests.get(FullURL, headers=Headers)
iterations = 1  # Data records returned are equivalent to iterations x100

# Loop: Response, Convert JSON data, Append to Data List, Get Pagination & Replace String in Variable - Iterate until 300 records
while count <= 3:
    # Grab JSON Data, Convert, & Append
    ResponseJSONData = Response.json()
    # print(pgn) - Debug
    pd.set_option('display.max_rows', None)
    TopGamesDF = pd.DataFrame(ResponseJSONData['data'])
    TopGamesDF = TopGamesDF[['id','name']]
    TopGamesDF = TopGamesDF.rename(columns={'id':'GameID','name':'GameName'})
    TopGamesDF['Rank'] = TopGamesDF.index + 1
    TwitchTopGamesDataFrame.append(TopGamesDF)
    # print(FullURL) - Debug
    # Grab & Replace Pagination Value
    ResponseJSONData['pagination']
    RPagination = pd.DataFrame(ResponseJSONData['pagination'], index=[0])
    pgn = str('&after=' + RPagination.to_string(index=False, header=False).strip())
    Pagination = pgn
    # print(FullURL) - Debug
    iterations += 1

TwitchTopGamesDataFrame
Figured it out:
import requests
import pandas as pd

def top_games(page_count):
    from time import gmtime, strftime
    strftime("%Y-%m-%d %H:%M:%S", gmtime())
    print("Time of Execution:", strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    # In order to condense the code above and be more efficient, a while/for loop works great.
    # Goal: Run a while loop to build a larger DataFrame through pagination, as the Twitch API only allows 100 records per call.
    baseURL = 'https://api.twitch.tv/helix/games/top?first=100'  # Base URL
    Headers = {'client-id':'lqctse0orgdbs5gdf5faz665api03r','Authorization': 'Bearer a1yl09mwmnwetp6ovocilheias8pzt'}
    Indent = 2
    Pagination = ''
    FullURL = baseURL + Pagination
    Response = requests.get(FullURL, headers=Headers)
    start_count = 0
    count = 0  # Data records returned are equivalent to iterations x100
    max_count = page_count
    # Loop: Response, Convert JSON data, Append to Data List, Get Pagination & Replace String in Variable
    while count <= max_count:
        # Grab JSON Data, Extend List
        FullURL = baseURL + Pagination
        Response = requests.get(FullURL, headers=Headers)
        ResponseJSONData = Response.json()
        pd.set_option('display.max_rows', None)
        if count == start_count:
            TopGamesDFL = ResponseJSONData['data']
        if count > start_count:
            i = ResponseJSONData['data']
            TopGamesDFL.extend(i)
        # Grab & Replace Pagination Value
        RPagination = pd.DataFrame(ResponseJSONData['pagination'], index=[0])
        pgn = str('&after=' + RPagination.to_string(index=False, header=False).strip())
        Pagination = pgn
        count += 1
        if count == max_count:
            FinalDataFrame = pd.DataFrame(TopGamesDFL)
            FinalDataFrame = FinalDataFrame[['id','name']]
            FinalDataFrame = FinalDataFrame.rename(columns={'id':'GameID','name':'GameName'})
            FinalDataFrame['Rank'] = FinalDataFrame.index + 1
            return FinalDataFrame
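A short usage sketch under the same assumptions as the code above (a valid client-id and Bearer token):

# Pull three pages (roughly 300 records) into one DataFrame.
top_games_df = top_games(3)
print(top_games_df.head())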

Issues with downloading dict into CSV accesing NY Times API via Python

I am currently trying to download a large number of NY Times articles using their API, on Python 2.7. To do so, I was able to reuse a piece of code I found online:
from nytimesarticle import articleAPI
api = articleAPI('...')

articles = api.search(q = 'Brazil',
                      fq = {'headline':'Brazil', 'source':['Reuters','AP', 'The New York Times']},
                      begin_date = '20090101')

def parse_articles(articles):
    '''
    This function takes in a response to the NYT api and parses
    the articles into a list of dictionaries
    '''
    news = []
    for i in articles['response']['docs']:
        dic = {}
        dic['id'] = i['_id']
        if i['abstract'] is not None:
            dic['abstract'] = i['abstract'].encode("utf8")
        dic['headline'] = i['headline']['main'].encode("utf8")
        dic['desk'] = i['news_desk']
        dic['date'] = i['pub_date'][0:10]  # cutting time of day.
        dic['section'] = i['section_name']
        if i['snippet'] is not None:
            dic['snippet'] = i['snippet'].encode("utf8")
        dic['source'] = i['source']
        dic['type'] = i['type_of_material']
        dic['url'] = i['web_url']
        dic['word_count'] = i['word_count']
        # locations
        locations = []
        for x in range(0, len(i['keywords'])):
            if 'glocations' in i['keywords'][x]['name']:
                locations.append(i['keywords'][x]['value'])
        dic['locations'] = locations
        # subject
        subjects = []
        for x in range(0, len(i['keywords'])):
            if 'subject' in i['keywords'][x]['name']:
                subjects.append(i['keywords'][x]['value'])
        dic['subjects'] = subjects
        news.append(dic)
    return(news)

def get_articles(date, query):
    '''
    This function accepts a year in string format (e.g. '1980')
    and a query (e.g. 'Amnesty International') and it will
    return a list of parsed articles (in dictionaries)
    for that year.
    '''
    all_articles = []
    for i in range(0, 100):  # NYT limits pager to first 100 pages. But rarely will you find over 100 pages of results anyway.
        articles = api.search(q = query,
                              fq = {'headline':'Brazil','source':['Reuters','AP', 'The New York Times']},
                              begin_date = date + '0101',
                              end_date = date + '1231',
                              page = str(i))
        articles = parse_articles(articles)
        all_articles = all_articles + articles
    return(all_articles)

Download_all = []
for i in range(2009, 2010):
    print 'Processing' + str(i) + '...'
    Amnesty_year = get_articles(str(i), 'Brazil')
    Download_all = Download_all + Amnesty_year

import csv
keys = Download_all[0].keys()
with open('brazil-mentions.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(Download_all)
Without the last bit (starting with "import csv"), this seems to be working fine. If I simply print my results ("print Download_all"), I can see them, although in a very unstructured way. Running the actual code, however, I get the message:
File "C:\Users\xxx.yyy\AppData\Local\Continuum\Anaconda2\lib\csv.py", line 148, in _dict_to_list
+ ", ".join([repr(x) for x in wrong_fields]))
ValueError: dict contains fields not in fieldnames: 'abstract'
Since I am quite a newbie at this, I would highly appreciate your help in guiding me on how to download the news articles into a CSV file in a structured way.
Thanks a lot in advance!
Best regards
Where you have:
keys = Download_all[0].keys()
This takes the column headers for the CSV from the dictionary for the first article. The problem is that the article dictionaries do not all have the same keys, so when you reach the first one that has the extra abstract key, it fails.
It looks like you'll have problems with abstract and snippet, which are only added to the dictionary if they exist in the response.
You need to make keys equal to the superset of all possible keys:
keys = Download_all[0].keys() + ['abstract', 'snippet']
Or, ensure that every dict has a value for every field:
def parse_articles(articles):
    ...
    if i['abstract'] is not None:
        dic['abstract'] = i['abstract'].encode("utf8")
    else:
        dic['abstract'] = ""
    ...
    if i['snippet'] is not None:
        dic['snippet'] = i['snippet'].encode("utf8")
    else:
        dic['snippet'] = ""
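Another hedged sketch (not part of the answer above): compute the union of keys across every article dict and pass restval='' so DictWriter fills any missing fields with empty strings instead of raising.

import csv

# Sketch: build the full set of fieldnames from all collected articles.
keys = set()
for article in Download_all:
    keys.update(article.keys())

with open('brazil-mentions.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, sorted(keys), restval='')
    dict_writer.writeheader()
    dict_writer.writerows(Download_all)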
