I have the following JSON file that I open to extract the content and retrieve the ids, which I store in a list.
# Load the exported results and collect the ids into a plain Python list.
# A `with` block closes the file (the original left it open), and
# df_Item['id'] replaces the obfuscated df_Item[['id'][0]] — ['id'][0]
# is just the string 'id'.
with open('content/dataProcessing_2022_02_23.json') as f:
    Item = json.load(f)['results']
df_Item = pd.DataFrame(Item, columns=['id'])
List_Item = df_Item['id'].values.tolist()
List_Item
Then I create the dictionary where I am going to store the content I will generate in my following script
# Mapping of category -> list of API endpoints to download; the category
# becomes a directory name and each endpoint becomes a JSON file name.
structureDictItem = {
"Item":[
"dataProcessingItem"
]
}
Here is my script:
def writeContentToFile(mode, customername, workspacename, category, endpoint, jsonContent):
    """Dump jsonContent to a dated JSON file under customer/workspace/category.

    Relies on a module-level `date` string for the filename suffix.  When a
    file for the same endpoint/date already exists, a numeric "(index)"
    marker is inserted so earlier downloads are never overwritten.
    """
    path = os.path.join(os.getcwd(), customername, workspacename, category)
    Path(path).mkdir(parents=True, exist_ok=True)

    def candidate(index):
        # First file carries no index marker; subsequent ones get "_(n)_".
        suffix = f'_{date}' if index == 1 else f'_({index})_{date}'
        return os.path.join(path, endpoint + suffix + '.json')

    # Probe for the first unused filename (the expression was previously
    # duplicated verbatim in the while condition and the open() call).
    index = 1
    while os.path.exists(candidate(index)):
        index += 1
    # `with` closes the file automatically; the explicit f.close() was redundant.
    with open(candidate(index), mode, encoding='utf-8') as f:
        json.dump(jsonContent, f, ensure_ascii=False, indent=4)
# Walk every category/endpoint pair and download its content.
for categoryItem in structureDictItem:
    for endpointItem in structureDictItem[categoryItem]:
        endpointFilenameItem = endpointItem
        url = DataGalaxy_url + endpointFilenameItem
        params = {
            "versionId": Workspace_id,
            "includeAccessData": "true",
            "includeAttributes": "true",
            "includeLinks": "true",
            "limit": 5000,
        }
        # First request without a dataProcessingId: the endpoint listing itself.
        jsonResponse = requests.get(url, params=params, headers={"Authorization": accessToken}).json()
        writeContentToFile('a', customername, workspacename, categoryItem, endpointFilenameItem, jsonResponse)
        try:
            # One request per stored id; each call produces a new dated file.
            for item in List_Item:
                params["dataProcessingId"] = item
                jsonResponse = requests.get(url=url, params=params, headers={"Authorization": accessToken}).json()['results']
                writeContentToFile('a', customername, workspacename, categoryItem, endpointFilenameItem, jsonResponse)
        except Exception:
            # was a bare `except:` followed by a no-op `next` expression;
            # report the failing endpoint and carry on with the next one
            print(endpointItem)
However, the result I get is not really the final result I am expecting: I wish to have all the content in the same JSON file. I understand why I am getting this output — I have 17 ids, so the script generates 17 different JSON files. I would like help to see how I can generate a single JSON file. Can someone give me a hint or an idea of what I need to add to my script?
dataProcessingItem_(10)_2022_02_23.json
dataProcessingItem_(11)_2022_02_23.json
dataProcessingItem_(12)_2022_02_23.json
dataProcessingItem_(13)_2022_02_23.json
dataProcessingItem_(14)_2022_02_23.json
dataProcessingItem_(15)_2022_02_23.json
dataProcessingItem_(16)_2022_02_23.json
dataProcessingItem_(17)_2022_02_23.json
dataProcessingItem_(18)_2022_02_23.json
dataProcessingItem_(2)_2022_02_23.json
dataProcessingItem_(3)_2022_02_23.json
dataProcessingItem_(4)_2022_02_23.json
dataProcessingItem_(5)_2022_02_23.json
dataProcessingItem_(6)_2022_02_23.json
dataProcessingItem_(7)_2022_02_23.json
dataProcessingItem_(8)_2022_02_23.json
dataProcessingItem_(9)_2022_02_23.json
Desired output :
dataProcessingItem_2022_02_23.json
Related
I am trying to scrape pickels.com.au.
I am trying to update the pickels_dataset.csv file: if the link is the same and the price is not the same, then I remove the old row and insert the new row into the CSV file, but it doesn't remove the old entry from the CSV file.
What would be the best way to remove and update the row in the CSV file.
Below is my code...
import requests
from scrapy.selector import Selector
import csv
import re
from tqdm import tqdm
from time import sleep
# Scrape current Pickles auctions and append rows to the dataset CSV.
# (Indentation restored; the paste had flattened every suite.)
with open('pickels_dataset.csv', 'a+', newline='', encoding='utf-8') as auction_csv_file:
    auction_csv_writer = csv.writer(auction_csv_file)
    live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
    api_request = requests.get(url=live_auctions_api)
    for auctions in api_request.json():
        auction_link = auctions.get('viewSaleListingLink')
        if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
            auction_request = requests.get(url=auction_link)
            response = Selector(text=auction_request.text)
            sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
            sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
            if sales_id == []:
                continue
            auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
            auction_sale_link_requests = requests.get(url=auction_sale_link)
            auctions_data = auction_sale_link_requests.json().get('SearchResults')
            if auctions_data == []:
                print("NO RESULTS")
            for auction_data in auctions_data:
                if int(auction_data.get('MinimumBid')) > 0:
                    ids = auction_data.get('TargetId')
                    main_title = auction_data.get('Title')
                    short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
                    # BUG FIX: the key was misspelled 'M ake', which always returned None
                    make = auction_data.get('Make')
                    model = auction_data.get('Model')
                    variant = auction_data.get('Series')
                    transmission = auction_data.get('Transmission')
                    odometer = auction_data.get('Odometer')
                    state = auction_data.get('Location').get('State')
                    sale_price = auction_data.get('MinimumBid')
                    link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
                    link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
                    sale_date = auction_data.get('SaleEndString')
                    auction_values = [
                        main_title, short_title, make,
                        model, variant, transmission, odometer,
                        state, "${:,.2f}".format(sale_price).strip(),
                        link, sale_date
                    ]
                    # NOTE(review): re-opening a file that is already open for
                    # appending is the root cause of stale rows never being
                    # removed — see the rewritten version below this question.
                    with open('pickels_dataset.csv', 'r+') as csv_read:
                        auction_reader = list(csv.reader(csv_read))
                        for each in auction_reader:
                            if link in each:
                                each_link, each_price = each[9], each[0]
                                if (link == each_link) and (sale_price != each_price):
                                    auction_reader.clear()
                                    print('New list found, old list deleted')
                                    auction_csv_writer.writerow(auction_values)
                                    print('New value added')
                                    continue
                                elif (link == each[9]) and (sale_price == each[0]):
                                    print('Same result already exist in the file')
                                    continue
                            else:
                                auction_csv_writer.writerow(auction_values)
                                print('Unique result found and added.')
                                break
Your current script is opening your auction CSV file for appending, and then whilst it is still open, attempting to open it again for reading. This is probably why it is not updating as expected.
A better approach would be to first read the entire contents of your existing saved auction file into a dictionary. The key could be the link which would then make it easy to determine if you have already seen an existing auction.
Next scrape the current auctions and update the saved_auctions dictionary as needed.
Finally at the end, write the contents of saved_auctions back to the CSV file.
For example:
import requests
from scrapy.selector import Selector
import csv
import re
auction_filename = 'pickels_dataset.csv'

# Existing rows keyed by their link column so updates are O(1) lookups.
saved_auctions = {}
with open(auction_filename, newline='', encoding='utf-8') as fh:
    for row in csv.reader(fh):
        saved_auctions[row[9]] = row  # column 9 holds the link

live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
api_request = requests.get(url=live_auctions_api)
for auctions in api_request.json():
    auction_link = auctions.get('viewSaleListingLink')
    if 'cars/item/search/-/listing/listSaleItems/' not in auction_link:
        continue
    page_resp = requests.get(url=auction_link)
    response = Selector(text=page_resp.text)
    sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
    sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
    if sales_id == []:
        continue
    auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
    sale_resp = requests.get(url=auction_sale_link)
    auctions_data = sale_resp.json().get('SearchResults')
    if auctions_data == []:
        print("NO RESULTS")
    for auction_data in auctions_data:
        if int(auction_data.get('MinimumBid')) <= 0:
            continue
        ids = auction_data.get('TargetId')
        main_title = auction_data.get('Title')
        short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
        make = auction_data.get('Make')
        model = auction_data.get('Model')
        variant = auction_data.get('Series')
        transmission = auction_data.get('Transmission')
        odometer = auction_data.get('Odometer')
        state = auction_data.get('Location').get('State')
        minimum_bid = auction_data.get('MinimumBid')
        sale_price = "${:,.2f}".format(minimum_bid).strip()
        link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
        link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
        sale_date = auction_data.get('SaleEndString')
        auction_values = [
            main_title, short_title, make,
            model, variant, transmission, odometer,
            state, sale_price,
            link, sale_date
        ]
        if link in saved_auctions:
            # index 8 of a saved row is its price column
            if saved_auctions[link][8] == sale_price:
                print('Same result already exists in the file')
            else:
                print('New value updated')
                saved_auctions[link] = auction_values  # replace the stale row
        else:
            print('New auction added')
            saved_auctions[link] = auction_values

# Rewrite the whole CSV from the (possibly updated) dictionary.
with open(auction_filename, 'w', newline='', encoding='utf-8') as fh:
    writer = csv.writer(fh)
    writer.writerows(saved_auctions.values())
If you want to also remove auctions that are no longer active, then it would probably be best to simply ignore the saved file and just write all current entries as is.
import pandas as pd
import requests
import json
import datetime
import csv
def get_pushshift_data(after, before, sub):
    """Fetch up to 400 submissions from the Pushshift API.

    `after`/`before` are Unix timestamps bounding the window, `sub` is the
    subreddit name.  Returns the list under the response's 'data' key,
    sorted ascending by creation time.
    """
    url = ('https://api.pushshift.io/reddit/search/submission/?&after=' + str(after) +
           '&before=' + str(before) + '&subreddit=' + str(sub) +
           '&sort=asc&sort_type=created_utc&size=400')
    print(url)
    # Pushshift throttles bursts of requests; a throttled response has a
    # non-JSON body, which is what raised "JSONDecodeError: Expecting value"
    # on the second call.  Check the status and back off instead of parsing
    # blindly.
    from time import sleep  # local import keeps the module surface unchanged
    for attempt in range(5):
        r = requests.get(url)
        if r.status_code == 200:
            return r.json()['data']
        sleep(2 ** attempt)
    r.raise_for_status()  # surface a clear HTTP error after exhausting retries
def collect_subData(subm):
    """Extract one submission's fields and record them in global subStats.

    The row is stored as a single tuple inside a one-element list, keyed by
    the submission id (the CSV writer later takes subStats[id][0]).
    """
    # dict.get with a default replaces the original try/except KeyError blocks
    flair = subm.get('link_flair_text', "NaN")
    body = subm.get('selftext', '')  # body of the post, absent for link posts
    subId = subm['id']
    created = datetime.datetime.fromtimestamp(subm['created_utc'])  # e.g. 1520561700.0
    subData = [(
        subId, subm['title'], body, subm['url'], subm['author'],
        subm['score'], created, subm['num_comments'], subm['permalink'], flair,
    )]
    subStats[subId] = subData
def update_subFile():
    """Prompt for a filename and dump every collected submission to a CSV.

    Reads the global subStats dict filled by collect_subData().
    """
    upload_count = 0
    location = "subreddit_data_uncleaned/"
    print("Input filename of submission file, please add .csv")
    filename = input()
    # BUG FIX: the original bound the path to `file` (shadowing the builtin)
    # and then re-bound the same name to the open handle in the with-clause.
    filepath = location + filename
    with open(filepath, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        headers = ["Post ID", "Title", "Body", "Url", "Author", "Score",
                   "Publish Date", "Total No. of Comments", "Permalink", "Flair"]
        writer.writerow(headers)
        for sub in subStats:
            writer.writerow(subStats[sub][0])
            upload_count += 1
        print(str(upload_count) + " submissions have been uploaded into a csv file")
# global dictionary to hold 'subData'
subStats = {}
# tracks no. of submissions
subCount = 0
# Subreddit to query
sub = 'politics'
# Unix timestamps bounding the crawl window
before = int(datetime.datetime(2021, 5, 17, 0, 0).timestamp())
after = int(datetime.datetime(2014, 1, 1, 0, 0).timestamp())

# Page through the API: each batch's last 'created_utc' seeds the next query.
data = get_pushshift_data(after, before, sub)
while len(data) > 0:
    for submission in data:
        collect_subData(submission)
        subCount += 1
    print(len(data))
    print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
    after = data[-1]['created_utc']
    data = get_pushshift_data(after, before, sub)

print(len(data))
update_subFile()
At line 1: I call the get_pushshift_data(after, before, sub) function to scrape the data and there is no error. But when I want to do the same thing again at line 11, with a different value for the after variable (type: int), the program raises the error JSONDecodeError: Expecting value: line 1 column 1 (char 0).
This is the image for you to refer to which I have just described above
This is the Error Image
How I can fix the following bug for the comments from commits that has been merged are not downloaded? I think there is a problem from REST to GraphQL but I don't know exactly how to fix.
The following script is used for:
Get all all quantified activities from a set of GitHub repositories. The
list of repos to be searched for are found in the repos.lst file.
Authentication to GitHub is also stored in separate file called github.token
containing the username and password/access token on two separate lines.
Quantified activities include merged pull-requests, closed issues (except for
those explicitly removed from the list for not being constructive) and comments.
Pull-requests:
Only the closed pull-requests are listed, and their merge status determined,
finally the merged pull-requests are stored in a JSON file, with entries
containing the reference number, the repository, the title, the author and the
pull-request URL.
Issues:
Only the closed issues are listed, the pull-requests (which are treated also as
issues by the GitHub API) removed from them, issues blacklisted in the
blacklist.lst file are also removed from the list, finally the remaining
issues are stored in a JSON file, with entries containing the reference number,
the repository, the title, the author and the issue URL.
Comments:
Comments from the commits, from the issues and from the pull-requests are all
listed and stored in JSON file with entries containing the author, the comment
ID, the repository, the comment content and the comment's URL. Issues comments
and pull-request comments will also contain the reference number of the issue
respectively the pull-request, with issues additionally having also the
original author of the issue, while the commit comments will contain the SHA1 of
the commit.
#!/usr/bin/env python3
# requests library is used to access GitHub API
import requests
import json
import re
import sys
import os.path
import bisect
import getpass
from itertools import chain
import repos
year = repos.get_year()
path = '../../{}/rezultate/'.format(year)

# read the list of repositories (one "org/name" per line)
repo_name = re.compile(r'([\w-]+)/([\w-]+)')
repos = [('etc-so', 'syllabus')]
with open(path + 'data/repos.lst', 'r') as f:
    for s in f:
        s = s.strip()
        # ignore empty lines and comments
        if s and not s.startswith('#'):
            m = repo_name.fullmatch(s)
            if not m:
                print("Invalid repo name: " + s)
                # BUG FIX: previously fell through and crashed on m.group(1)
                # with m being None
                continue
            repos.append((m.group(1), m.group(2)))
# read the list of students; each entry's 'user' field is a GitHub login
students = []
with open(path + 'data/students.json', 'r') as f:
    students = [x['user'] for x in json.load(f)]
if not students:
    print("No students to check for")
    sys.exit(1)

# get the access credentials: prefer the github.token file (username on the
# first line, password/token on the second), otherwise prompt interactively
if os.path.exists('github.token'):
    with open('github.token', 'r') as f:
        auth = (f.readline().strip(), f.readline().strip())
else:
    auth = (input("Enter GitHub username: "), getpass.getpass('Password: '))
# construct a labels list, so it can be added to the comments as well
issue_labels = {}

# Load the persistent PR, issue and comment records (id -> saved tuple);
# each falls back to an empty dict when its file does not exist yet.
if os.path.exists(path + 'data/pulls.json'):
    with open(path + 'data/pulls.json', 'r') as f:
        saved_pulls = {
            x['id']: (x['week'], x['value'], x['author'])
                     + ((x['multi'],) if 'multi' in x else ())
            for x in json.load(f)
        }
else:
    saved_pulls = {}

if os.path.exists(path + 'data/issues.json'):
    with open(path + 'data/issues.json', 'r') as f:
        saved_issues = {x['id']: (x['week'], x['value']) for x in json.load(f)}
else:
    saved_issues = {}

if os.path.exists(path + 'data/comments.json'):
    with open(path + 'data/comments.json', 'r') as f:
        saved_comments = {x['id']: (x['week'], x['value']) for x in json.load(f)}
else:
    saved_comments = {}
current_week = 0
# if there was already some activity then move ahead to the last week
if saved_pulls or saved_issues or saved_comments:
    current_week = max(week for week, *_ in chain(saved_pulls.values(),
                                                  saved_issues.values(),
                                                  saved_comments.values()))

# command line: a number sets the week, '-n' bumps it, '?' just reports it
if len(sys.argv) == 2:
    arg = sys.argv[1]
    if arg.isdigit():
        current_week = int(arg)
    elif arg == '-n' and current_week != 0:
        # -n increments the current week if it can be obtained from the activity
        current_week += 1
        print("Switching to week {}".format(current_week))
    elif arg == '?':
        print("Current week is {}".format(current_week))
        sys.exit(0)
    else:
        print("Invalid command line parameter")
        sys.exit(1)
elif len(sys.argv) > 2:
    print("Too many parameters")
    sys.exit(1)

# if no current week was obtained, start with week 1
if not current_week:
    current_week = 1
# GitHub REST v3 base: full URL is api_url + get_url.format(org, repo, resource)
api_url = "https://api.github.com/"
get_url = "repos/{}/{}/{}"
# default query parameters for closed PR/issue listings
c_params = {
'state': 'closed', # get closed pull-requests/issues
'sort': 'created',
'direction': 'asc', # sort it in ascending order by their creation time
}
# accumulators shared across all repositories
pr = []          # merged pull-request records
pr_times = []    # merge timestamps, kept sorted, parallel to pr
gi = []          # issue records
comments = []    # comment records (commit, issue and review comments)
c_times = []     # comment timestamps, kept sorted, parallel to comments
authors = {}     # (repo, ref) -> issue/PR author login
# extracts "pull"/"issues" and the reference number from a comment URL
issue_comment_re = re.compile(r'.*/(pull|issues)/([0-9]+)#.*')
# parses the RFC 5988 Link header used for GitHub pagination
link_rel = re.compile(r'<([^>]+)>; rel="([\w]+)"(?:, )?')
request_count = 0
def github_get(get_code, **params):
    """GET a paginated GitHub resource for the current (org, repo) pair.

    Follows the Link header's rel="next" URL until exhausted and returns the
    concatenated JSON list.  Exits the program on any non-200 response.
    Updates the global request_count from the rate-limit header.
    """
    global request_count
    # removed unused `global raw_data` declaration (never assigned or read)
    res = []
    url = api_url + get_url.format(org, repo, get_code)
    while url:
        r = requests.get(url, auth=auth, params=params)
        if r.status_code != 200:
            print('Status code {} received'.format(r.status_code))
            print(r.content)
            sys.exit(1)
        links = dict((m[1], m[0]) for m in link_rel.findall(r.headers.get('link', '')))
        url = links.get('next')  # None when there is no further page
        res.extend(r.json())
        request_count = r.headers['X-RateLimit-Remaining']
    return res
# Point value of known label names; negative values act as a veto.
label_values = {
    'bug': 2,
    'enhancement': 2,
    'challenge': 2,
    'help wanted': 1,
    'invalid': -1,
    'duplicate': -2
}

def label_value(labels):
    """Return the point value of a list of label names.

    Predefined names and black-star (★) counts compete: any negative score
    wins outright (veto, no bonus), otherwise the maximum applies and each
    white star (☆) adds one bonus point.
    """
    scores = [label_values[name] for name in labels if name in label_values]
    scores.extend(name.count('★') for name in labels)
    bonus = sum(name.count('☆') for name in labels)
    if not scores:
        return bonus
    lowest = min(scores)
    if lowest < 0:
        return lowest  # a negative label vetoes everything, bonus ignored
    return max(scores) + bonus
def issue_value(value, issue_id=None):
    """Pass `value` through, warning when it differs from the saved record."""
    if issue_id:
        previous = saved_issues.get(issue_id)
        if previous and previous[1] != value:
            print("Value change detected in issue {}".format(issue_id))
    return value
def pull_value(label, pull_id=None, ref=0):
    """Score a pull-request from its labels, reconciling with saved data.

    Returns None when the PR has no labels and no saved value can supply one.
    """
    value = label_value(label) if label else None
    if not label:
        print("No label for pull-request {}".format(ref))
    if pull_id:
        previous = saved_pulls.get(pull_id)
        # NOTE(review): the indentation of the original was lost in the paste;
        # restoring the saved value under the change-detection branch is the
        # reading that cannot crash when no record exists — confirm upstream.
        if previous and previous[1] is not None and previous[1] != value:
            print("Value change detected in pull-request {}".format(ref))
            if value is None:
                value = previous[1]
    return value
# running totals across all repositories (PRs, issues, comments)
ptotal = 0
itotal = 0
ctotal = 0
# pass through the repos
# (Indentation restored; stray `;` terminators and commented-out debug
# dumps removed.)
for org, repo in repos:
    print('Processing repo {}:'.format(repo), end=' ', flush=True)

    # get all the issues, do this first as it also includes the pull-requests
    # for which only here we can determine the labels
    issues = github_get('issues', state='all', sort='created', direction='asc')
    icount = 0
    for i in issues:
        ref = int(i['number'])
        author = i['user']['login']
        authors[(repo, ref)] = author
        label = [x['name'] for x in i['labels']]
        issue_labels[(repo, ref)] = label
        # real issues only (the API lists PRs here too) from tracked students
        if 'pull_request' not in i and author in students:
            gi.append(
                {
                    'id': i['id'],
                    'repo': repo,
                    'ref': ref,
                    'title': i['title'],
                    'url': i['html_url'],
                    'author': author,
                    'label': label,
                    'week': saved_issues.get(i['id'], [current_week])[0],
                    'value': issue_value(label_value(label), i['id'])
                })
            icount += 1

    # get the merged pull-requests
    pulls = github_get('pulls', state='closed')
    pcount = 0
    for p in pulls:
        ref = int(p['number'])
        author = p['user']['login']
        label = issue_labels.get((repo, ref), [])
        if p['merged_at'] and author in students and (not label or label_value(label) >= 0):
            # keep pr sorted by merge time via a parallel timestamp list
            i = bisect.bisect_right(pr_times, p['merged_at'])
            pr_times.insert(i, p['merged_at'])
            # check authors of included commits
            if p['id'] in saved_pulls:
                author = saved_pulls[p['id']][2]
            else:
                pcommits = github_get('pulls/{}/commits'.format(ref))
                pc_authors = [author]
                for a in pcommits:
                    al = a['author'].get('login') if a['author'] else None
                    cl = a['committer'].get('login') if a['committer'] else None
                    # web-flow is GitHub's UI committer; kcs is the instructor
                    if al == cl or cl == 'web-flow' or cl == 'kcs':
                        aa = al
                    else:
                        aa = ':'.join(x for x in [al, cl] if x)
                    if aa and aa not in pc_authors and aa != 'kcs':
                        pc_authors.append(aa)
                if len(pc_authors) != 1:
                    author = pc_authors
            pr.insert(i, {
                'id': p['id'],
                'repo': repo,
                'ref': ref,
                'title': p['title'],
                'url': p['html_url'],
                'label': label,
                'author': author,
                'week': saved_pulls.get(p['id'], [current_week])[0],
                'value': pull_value(label, p['id'], ref)
            })
            if len(saved_pulls.get(p['id'], [])) > 3:
                pr[i]['multi'] = saved_pulls[p['id']][3]
            pcount += 1

    # and now for the comments:
    # this is more troublesome as constructive comments must be selected
    # manually, so we are keeping a persistent JSON file for the comments
    # holding a valid tag; newly downloaded comments have this tag unset
    # and they can be validated afterwards manually (or by script)
    ccount = 0
    # 3 types of comments exist on GitHub: issue comments, review comments
    # and commit comments, all have to be handled separately
    ccomments = github_get('comments', per_page=50, sort='created')
    for c in ccomments:
        author = c['user']['login']
        if author in students:
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'commit': c['commit_id'],
                    'msg': c['body'],
                    'url': c['html_url'],
                    'author': author,
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            ccount += 1

    icomments = github_get('issues/comments', per_page=50, sort='created')
    for c in icomments:
        author = c['user']['login']
        if author in students:
            url = c['html_url']
            m = issue_comment_re.fullmatch(url)
            if not m:
                print("Problem parsing issue url " + url)
                sys.exit(1)
            ref = int(m.group(2))
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'issue': ref,
                    'msg': c['body'],
                    'url': url,
                    'author': author,
                    'issue_author': authors[(repo, ref)],
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            if m.group(1) == 'issues' and (repo, ref) in issue_labels:
                comments[i]['label'] = issue_labels[(repo, ref)]
            ccount += 1

    pcomments = github_get('pulls/comments', per_page=50, sort='created')
    for c in pcomments:
        author = c['user']['login']
        if author in students:
            ref = int(c['pull_request_url'].rsplit('/', 1)[1])
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'pull': ref,
                    'msg': c['body'],
                    'url': c['html_url'],
                    'author': author,
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            ccount += 1

    print('found {} merged pull-requests, {} issues and {} comments'.format(pcount, icount, ccount))
    ptotal += pcount
    itotal += icount
    ctotal += ccount
# Persist the collected records and report the grand totals.
with open(path + 'data/pulls.json', 'w') as f:
    json.dump(pr, f, indent=4, sort_keys=True, ensure_ascii=False)
with open(path + 'data/issues.json', 'w') as f:
    json.dump(gi, f, indent=4, sort_keys=True, ensure_ascii=False)
with open(path + 'data/comments.json', 'w') as f:
    json.dump(comments, f, indent=4, sort_keys=True, ensure_ascii=False)

print("Total: {} merged pull-requests, {} issues and {} comments [{} contributions]".format(
    ptotal, itotal, ctotal, ptotal + itotal + ctotal))
print("Remaining request count: {}".format(request_count))
# Map repo name -> its position in the repos list, for stable ordering.
repo_key = {r[1]: i for i, r in enumerate(repos)}

def sort_repos(x):
    '''Sort the repos in a blacklist with (repo,ref) structure in the order
    as they appear in the repos list.
    If repo is not in the list then put them afterwards
    '''
    if x[0] in repo_key:
        return (repo_key[x[0]], x[1])
    return (len(repos),) + x
i am new to python as a matter of fact, this is my first python project. I am using ebaysdk to search for electronics on ebay and i want it to return multiple results because my app is for comparing prices but it returns only one result.
Someone please help me to make the code return multiple results.
Here is my code snippet.
#app.route('/ebay_page_post', methods=['GET', 'POST'])
def ebay_page_post():
    """Search eBay (findItemsAdvanced) and return ALL matching items.

    BUG FIX: the original `return jsonify(...)` sat inside the for loop, so
    only the first item ever reached the client.  Results are now collected
    in a list and returned together after the loop.
    """
    if request.method == 'POST':
        # Get json format of the text sent by Ajax
        search = request.json['search']
        try:
            # ebaysdk code starts here
            api = finding(appid='JohnOkek-hybridse-PRD-5c2330105-9bbb62f2', config_file=None)
            api_request = {'keywords': search, 'outputSelector': 'SellerInfo', 'categoryId': '293'}
            response = api.execute('findItemsAdvanced', api_request)
            soup = BeautifulSoup(response.content, 'lxml')
            totalentries = int(soup.find('totalentries').text)
            items = soup.find_all('item')
            results = []
            for item in items:
                cat = item.categoryname.string.lower()
                title = item.title.string.lower().strip()
                price = int(round(float(item.currentprice.string)))
                url = item.viewitemurl.string.lower()
                seller = item.sellerusername.text.lower()
                listingtype = item.listingtype.string.lower()
                condition = item.conditiondisplayname.string.lower()
                print('____________________________________________________________')
                results.append(cat + '|' + title + '|' + str(price) + '|' + url + '|' + seller + '|' + listingtype + '|' + condition)
            # return json format of ALL results for Ajax processing
            return jsonify(results)
        except ConnectionError as e:
            return jsonify(e)
Based on the code you provided, added the key value pair collection example you could use :
#app.route('/ebay_page_post', methods=['GET', 'POST'])
def ebay_page_post():
    """Search eBay and return every matching item keyed by its index.

    Fixes over the posted snippet: `index++` is not valid Python
    (SyntaxError) and becomes `index += 1`; the Python-2 debug print with a
    missing bracket is corrected; plain strings are stored (jsonify-ing each
    item made the dict unreturnable) and the collection is actually returned.
    """
    if request.method == 'POST':
        # Get json format of the text sent by Ajax
        search = request.json['search']
        try:
            # ebaysdk code starts here
            api = finding(appid='JohnOkek-hybridse-PRD-5c2330105-9bbb62f2', config_file=None)
            api_request = {'keywords': search, 'outputSelector': 'SellerInfo', 'categoryId': '293'}
            response = api.execute('findItemsAdvanced', api_request)
            soup = BeautifulSoup(response.content, 'lxml')
            totalentries = int(soup.find('totalentries').text)
            items = soup.find_all('item')
            # This will be returned; index is the key, the item string the value
            itemsFound = {}
            index = 0
            for item in items:
                cat = item.categoryname.string.lower()
                title = item.title.string.lower().strip()
                price = int(round(float(item.currentprice.string)))
                url = item.viewitemurl.string.lower()
                seller = item.sellerusername.text.lower()
                listingtype = item.listingtype.string.lower()
                condition = item.conditiondisplayname.string.lower()
                # Add the item found to the collection
                itemsFound[index] = cat + '|' + title + '|' + str(price) + '|' + url + '|' + seller + '|' + listingtype + '|' + condition
                index += 1
            for key in itemsFound:
                print(key, ':', itemsFound[key])
            return jsonify(itemsFound)
        except ConnectionError as e:
            return jsonify(e)
Once the first item is found, add it to the collection. After your for loop finishes, then return the collection.
Right now you are returning (breaking the iteration) once you have found the first item.
I was able to solve the problem.
Click here to see how i did it
Thanks to every contributor, i am most grateful to you all.
NOTE: There is no fixed URL for it, which means you cannot rely on seeing this exact URL every time. I want code that works for all such URLs.
For ex, http://januapp.com/demo/search.php?search=aaa
http://januapp.com/demo/search.php?other=aaa
Now I want to change it to
http://januapp.com/demo/search.php?search=bbb
http://januapp.com/demo/search.php?other=bbb
I don't know how can I do it?
I tried this
import optparse
import requests
import urlparse
parser = optparse.OptionParser()
parser.add_option("-t","--Host", dest="Target", help="Please provide the target", default="true")
options, args = parser.parse_args()
url = options.Target
xss = []
xss.append("bbb")
try:
url2 =urlparse.urlparse(url)
print url2
url3 = urlparse.parse_qs(url2.query)
parametervalue = [key for key, key in url3.iteritems()] #[['aaa']]
parsed = parametervalue.append(xss[0])
print parsed
finalurl = urljoin(url, parsed)
print finalurl
except Exception as e:
print e
So when I pass this
xss3.py -t http://januapp.com/demo/search.php?search=aaa
The Error occurs below on to the cmd
ParseResult(scheme='http', netloc='januapp.com', path='/demo/search.php', params='', query='search=aaa', fragment='')
None
name 'urljoin' is not defined
See the None
Now that's the problem,
I am using Python2.7.
Thank you very much. Hope you get the problem.
You can try something with this kind of approach.
url = 'http://januapp.com/demo/search.php?search=aaa'

# Split the URL into the base part and the query string.
base_url, _, query_string = url.partition('?')

# Parse "key=value" pairs into a dict.  BUG FIX: the original split the
# whole query string on '=', which breaks as soon as the URL carries more
# than one parameter (e.g. "search=aaa&other=bbb"); split on '&' first.
# (The original loop also shadowed the builtin `str`.)
param_value_dict = {}
for pair in query_string.split('&'):
    if pair:
        name, _, value = pair.partition('=')
        param_value_dict[name] = value

# now if you want to change the value of search from 'aaa' to 'bbb',
# just change it in the dictionary
param_value_dict['search'] = 'bbb'

# rebuild the URL from the dictionary; join() avoids the trailing-'&' trim
new_url = base_url + '?' + '&'.join(
    name + '=' + value for name, value in param_value_dict.items())
print(new_url)
How about:
# Python 2 snippet: build the search URL by simple string concatenation.
ext = "bbb"
a = "http://januapp.com/demo/search.php?search="
print a+ext
Where ext is what you want to search for, a is the link and just add them together.
Or you could replace values like this:
# Python 2 snippet: swap the old search term for the new one via str.replace.
ext = "bbb"
a = "http://januapp.com/demo/search.php?search=aaa"
print a.replace('aaa', ext)
Using regex:
import re
# Python 2 snippet: keep everything up to and including "search=" and
# append the new term.
ext = "bbb"
a = "http://januapp.com/demo/search.php?search=aaa"
b=re.compile(r".+search=")
print re.search(b,a).group()+ext