Python implementation with github - python

How can I fix the following bug, where comments from commits that have been merged are not downloaded? I think the problem comes from the move from REST to GraphQL, but I don't know exactly how to fix it.
The following script is used for:
Get all quantified activities from a set of GitHub repositories. The
list of repos to be searched for is found in the repos.lst file.
Authentication to GitHub is also stored in separate file called github.token
containing the username and password/access token on two separate lines.
Quantified activities include merged pull-requests, closed issues (except for
those explicitly removed from the list for not being constructive) and comments.
Pull-requests:
Only the closed pull-requests are listed, and their merge status determined,
finally the merged pull-requests are stored in a JSON file, with entries
containing the reference number, the repository, the title, the author and the
pull-request URL.
Issues:
Only the closed issues are listed, the pull-requests (which are also treated as
issues by the GitHub API) are removed from them, issues blacklisted in the
blacklist.lst file are also removed from the list, finally the remaining
issues are stored in a JSON file, with entries containing the reference number,
the repository, the title, the author and the issue URL.
Comments:
Comments from the commits, from the issues and from the pull-requests are all
listed and stored in JSON file with entries containing the author, the comment
ID, the repository, the comment content and the comment's URL. Issues comments
and pull-request comments will also contain the reference number of the issue
respectively the pull-request, with issues additionally having also the
original author of the issue, while the commit comments will contain the SHA1 of
the commit.
#!/usr/bin/env python3
# requests library is used to access GitHub API
import requests
import json
import re
import sys
import os.path
import bisect
import getpass
from itertools import chain
import repos

# NOTE(review): the `repos` module is shadowed by the `repos` list created
# below, so get_year() must be called before the shadowing assignment.
year = repos.get_year()
path = '../../{}/rezultate/'.format(year)

# read the list of repositories (one "org/repo" entry per line)
repo_name = re.compile(r'([\w-]+)/([\w-]+)')
repos = [('etc-so', 'syllabus')]
with open(path + 'data/repos.lst', 'r') as f:
    for s in f:
        s = s.strip()
        # ignore empty lines
        if s and not s.startswith('#'):
            m = repo_name.fullmatch(s)
            if not m:
                print("Invalid repo name: " + s)
                # BUG FIX: previously execution fell through and called
                # m.group() on None, crashing on the first malformed entry;
                # skip the bad line instead
                continue
            repos.append((m.group(1), m.group(2)))

# read the list of students (only their activity is quantified)
students = []
with open(path + 'data/students.json', 'r') as f:
    students = [x['user'] for x in json.load(f)]
if not students:
    print("No students to check for")
    sys.exit(1)

# get the access token: username on the first line of github.token, the
# password/token on the second; fall back to interactive prompt
if os.path.exists('github.token'):
    with open('github.token', 'r') as f:
        auth = (f.readline().strip(), f.readline().strip())
else:
    auth = (input("Enter GitHub username: "), getpass.getpass('Password: '))

# construct a labels list, so it can be added to the comments as well
issue_labels = {}

# get the persistent PR, issue and comment lists; each maps id -> saved tuple
if os.path.exists(path + 'data/pulls.json'):
    with open(path + 'data/pulls.json', 'r') as f:
        saved_pulls = dict(
            (x['id'], (x['week'], x['value'], x['author'])
             + ((x['multi'],) if 'multi' in x else ()))
            for x in json.load(f))
else:
    saved_pulls = {}
if os.path.exists(path + 'data/issues.json'):
    with open(path + 'data/issues.json', 'r') as f:
        saved_issues = dict((x['id'], (x['week'], x['value'])) for x in json.load(f))
else:
    saved_issues = {}
if os.path.exists(path + 'data/comments.json'):
    with open(path + 'data/comments.json', 'r') as f:
        saved_comments = dict((x['id'], (x['week'], x['value'])) for x in json.load(f))
else:
    saved_comments = {}

current_week = 0
# if there were already some activity then move ahead to the last week
if saved_pulls or saved_issues or saved_comments:
    current_week = max(x[0] for x in chain(saved_pulls.values(),
                                           saved_issues.values(),
                                           saved_comments.values()))

# if there is a command line argument use it as current week
if len(sys.argv) == 2:
    if sys.argv[1].isdigit():
        current_week = int(sys.argv[1])
    # -n increments the current week if it can be obtained from the activity
    elif sys.argv[1] == '-n' and current_week != 0:
        current_week += 1
        print("Switching to week {}".format(current_week))
    elif sys.argv[1] == '?':
        print("Current week is {}".format(current_week))
        sys.exit(0)
    else:
        print("Invalid command line parameter")
        sys.exit(1)
elif len(sys.argv) > 2:
    print("Too many parameters")
    sys.exit(1)
# if no current week was obtained, start with week 1
if not current_week:
    current_week = 1

api_url = "https://api.github.com/"
get_url = "repos/{}/{}/{}"
# default query parameters (kept for reference; individual calls below pass
# their own parameters explicitly)
c_params = {
    'state': 'closed',   # get closed pull-requests/issues
    'sort': 'created',
    'direction': 'asc',  # sort it in ascending order by their creation time
}
pr = []        # merged pull-request records
pr_times = []  # merge timestamps, kept sorted for chronological insertion
gi = []        # accepted issue records
comments = []  # commit + issue + review comment records
c_times = []   # comment creation timestamps, kept sorted
authors = {}   # (repo, ref) -> author login of the issue/pull-request
issue_comment_re = re.compile(r'.*/(pull|issues)/([0-9]+)#.*')
# parses RFC 5988 Link headers used for pagination
link_rel = re.compile(r'<([^>]+)>; rel="([\w]+)"(?:, )?')
request_count = 0
def github_get(get_code, **params):
    """Fetch every page of a GitHub REST collection for the current repo.

    `get_code` is the endpoint suffix under repos/{org}/{repo}/ (for example
    'issues' or 'pulls/comments'); `params` become query parameters.  Relies
    on the module-level `org`, `repo` and `auth` being set by the caller.
    Exits the whole script on any non-200 response.  Returns the concatenated
    list of JSON items from all pages and updates the global rate-limit
    counter `request_count`.
    """
    # BUG FIX: removed the unused `global raw_data` declaration
    global request_count
    res = []
    url = api_url + get_url.format(org, repo, get_code)
    while url:
        r = requests.get(url, auth=auth, params=params)
        if r.status_code != 200:
            print('Status code {} received'.format(r.status_code))
            print(r.content)
            sys.exit(1)
        # follow pagination: request the 'next' link until there is none
        links = dict((m[1], m[0]) for m in link_rel.findall(r.headers.get('link', '')))
        url = links.get('next')
        res.extend(r.json())
        request_count = r.headers['X-RateLimit-Remaining']
    return res
# score assigned to each known label name; negative values mark items that
# should not be credited
label_values = {
    'bug': 2,
    'enhancement': 2,
    'challenge': 2,
    'help wanted': 1,
    'invalid': -1,
    'duplicate': -2
}

def label_value(labels):
    """Compute the score of an issue/pull-request from its label names.

    Any negative predefined label wins outright; otherwise the best of the
    predefined scores and black-star (★) counts is taken, and white stars
    (☆) are added on top as a bonus.
    """
    scores = [label_values[name] for name in labels if name in label_values]
    scores.extend(name.count('★') for name in labels)
    if not scores:
        base = 0
    elif min(scores) < 0:
        # a negative label invalidates the whole item, no bonuses apply
        return min(scores)
    else:
        base = max(scores)
    return base + sum(name.count('☆') for name in labels)
def issue_value(value, issue_id=None):
    """Return *value*, warning when it differs from the previously saved one.

    When *issue_id* is given, the saved (week, value) tuple is consulted and
    a message is printed if the value changed; the new value is still used.
    """
    if issue_id:
        previous = saved_issues.get(issue_id)
        if previous and previous[1] != value:
            print("Value change detected in issue {}".format(issue_id))
    return value
def pull_value(label, pull_id=None, ref=0):
    """Score a merged pull-request from its labels.

    Falls back to the previously saved value when the PR currently has no
    labels.  `ref` is used only in diagnostic messages.
    """
    if label:
        value = label_value(label)
    else:
        value = None
        print("No label for pull-request {}".format(ref))
    if pull_id:
        old_value = saved_pulls.get(pull_id)
        # NOTE(review): nesting below reconstructed from a whitespace-mangled
        # paste; the None-recovery is kept inside the change-detected branch
        # so old_value is guaranteed non-None when indexed — confirm against
        # the upstream script
        if old_value and old_value[1] is not None and old_value[1] != value:
            print("Value change detected in pull-request {}".format(ref))
            if value is None:
                value = old_value[1]
    return value
# running totals over all repositories
ptotal = 0
itotal = 0
ctotal = 0
# pass through the repos
for org,repo in repos:
    print('Processing repo {}:'.format(repo), end=' ', flush=True)
    # get all the issues, do this first as it all includes the pull-requests
    # for which only here we can determine the labels
    issues = github_get('issues', state='all', sort='created', direction='asc')
    icount = 0;
    for i in issues:
        ref = int(i['number'])
        author = i['user']['login']
        # remember author and labels of every issue/PR; the comment passes
        # below need them
        authors[(repo, ref)] = author
        label = [x['name'] for x in i['labels']]
        issue_labels[(repo, ref)] = label
        # the issues endpoint also returns pull-requests; keep only true
        # issues opened by tracked students
        if 'pull_request' not in i and author in students:
            gi.append(
                {
                    'id': i['id'],
                    'repo': repo,
                    'ref': ref,
                    'title': i['title'],
                    'url': i['html_url'],
                    'author': author,
                    'label': label,
                    # previously seen issues keep their original week
                    'week' : saved_issues.get(i['id'], [current_week])[0],
                    'value' : issue_value(label_value(label), i['id'])
                })
            icount += 1
    # get the merged pull-requests
    pulls = github_get('pulls', state='closed')
    pcount = 0;
    #print(r.headers)
    for p in pulls:
        ref = int(p['number'])
        author = p['user']['login']
        label = issue_labels.get((repo, ref), [])
        # only merged PRs by students that are not devalued by a negative label
        if p['merged_at'] and author in students and (not label or label_value(label) >= 0):
            # insert chronologically by merge timestamp
            i = bisect.bisect_right(pr_times, p['merged_at'])
            pr_times.insert(i, p['merged_at'])
            # check authors of included commits
            if p['id'] in saved_pulls:
                author = saved_pulls[p['id']][2]
            else:
                pcommits = github_get('pulls/{}/commits'.format(ref))
                pc_authors = [author]
                for a in pcommits:
                    al = a['author'].get('login') if a['author'] else None
                    cl = a['committer'].get('login') if a['committer'] else None
                    # 'web-flow' is GitHub's web-UI committer; 'kcs' commits
                    # are not credited
                    if al == cl or cl == 'web-flow' or cl == 'kcs':
                        aa = al
                    else:
                        aa = ':'.join(x for x in [al, cl] if x)
                    if aa and aa not in pc_authors and aa != 'kcs':
                        pc_authors.append(aa)
                # several distinct commit authors turn `author` into a list
                if len(pc_authors) != 1:
                    author = pc_authors
            pr.insert(i, {
                'id': p['id'],
                'repo': repo,
                'ref': ref,
                'title': p['title'],
                'url': p['html_url'],
                'label': label,
                'author': author,
                'week': saved_pulls.get(p['id'], [current_week])[0],
                'value': pull_value(label, p['id'], ref)
            })
            # carry over a saved multi-week marker if one was present
            if len(saved_pulls.get(p['id'], [])) > 3:
                pr[i]['multi'] = saved_pulls[p['id']][3]
            pcount += 1
    # and now for the comments:
    # this is more troublesome as constructive comments must be selected
    # manually, so we are keeping persistent JSON file for the comments
    # holding a valid tag, newly downloaded comments have this tag unset
    # and they can be validated afterwards manually (or by script)
    ccount = 0;
    # 3 types of comments exist on GitHub: issue comments, review comments
    # and commit comments, all have to be handled separately
    # NOTE(review): /repos/{org}/{repo}/comments lists commit comments; if
    # comments on merged commits appear to be missing, verify what this
    # endpoint returns for commits brought in via merged branches — TODO
    # confirm against the GitHub REST documentation
    ccomments = github_get('comments', per_page=50, sort='created')
    #with open('all_ccomments.json', 'w') as f:
    #    json.dump(ccomments, f, indent=4, sort_keys=True)
    for c in ccomments:
        author = c['user']['login']
        if author in students:
            # keep the global comment list sorted by creation time
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'commit': c['commit_id'],
                    'msg': c['body'],
                    'url': c['html_url'],
                    'author': author,
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            ccount += 1
    icomments = github_get('issues/comments', per_page=50, sort='created')
    for c in icomments:
        author = c['user']['login']
        if author in students:
            url = c['html_url']
            # the issue/PR number is only available inside the comment URL
            m = issue_comment_re.fullmatch(url)
            if not m:
                print("Problem parsing issue url " + url)
                sys.exit(1)
            ref = int(m.group(2))
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'issue': ref,
                    'msg': c['body'],
                    'url': url,
                    'author': author,
                    'issue_author': authors[(repo, ref)],
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            # issue comments inherit the labels of the issue they belong to
            if m.group(1) == 'issues' and (repo, ref) in issue_labels:
                comments[i]['label'] = issue_labels[(repo, ref)]
            ccount += 1
    # review comments attached to pull-request diffs
    pcomments = github_get('pulls/comments', per_page=50, sort='created')
    for c in pcomments:
        author = c['user']['login']
        if author in students:
            ref = int(c['pull_request_url'].rsplit('/', 1)[1])
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'pull': ref,
                    'msg': c['body'],
                    'url': c['html_url'],
                    'author': author,
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            ccount += 1
    print('found {} merged pull-requests, {} issues and {} comments'.format(pcount, icount, ccount))
    ptotal += pcount
    itotal += icount
    ctotal += ccount
# persist the collected data, overwriting the previous snapshots
with open(path + 'data/pulls.json', 'w') as f:
    json.dump(pr, f, indent=4, sort_keys=True, ensure_ascii=False)
with open(path + 'data/issues.json', 'w') as f:
    json.dump(gi, f, indent=4, sort_keys=True, ensure_ascii=False)
with open(path + 'data/comments.json', 'w') as f:
    json.dump(comments, f, indent=4, sort_keys=True, ensure_ascii=False)
print("Total: {} merged pull-requests, {} issues and {} comments [{} contributions]".format(
    ptotal, itotal, ctotal, ptotal + itotal + ctotal))
print("Remaining request count: {}".format(request_count))
# position of each repo name in the configured order (used by sort_repos)
repo_key = dict((r[1], i) for i, r in enumerate(repos))
def sort_repos(x):
    """Sort key for (repo, ref) pairs: configured repo order first.

    Pairs whose repo is unknown sort after every known repo, ordered by the
    pair itself.
    """
    position = repo_key.get(x[0])
    if position is None:
        return (len(repos),) + x
    return (position, x[1])

Related

How to properly compare 2 JSON request response strings in Python

I want to compare 2 Python response strings and print out the differences, here is my code right now:
import requests
import json
import time

# snapshot of the first response, serialized to an indented JSON *string*
getFirst = requests.get("https://api-mainnet.magiceden.dev/v2/collections?offset=0&limit=1")
liveRN = json.dumps(getFirst.json(), indent=4)
while True:
    get = requests.get("https://api-mainnet.magiceden.dev/v2/collections?offset=0&limit=1")
    dataPretty = json.dumps(get.json(), indent=4)
    data = get.json()
    if get.status_code == 200:
        print("ok")
    # NOTE(review): this compares a JSON string with the parsed list — they
    # are never equal, so this branch runs on every single iteration
    if dataPretty != data:
        for item in data:
            # substring probe against the serialized first response
            if str(item) not in liveRN:
                send = 1
                print(f"Found difference: {item}")
                symbol = item['symbol']
                img = item['image']
                name = item['name']
                description = item['description']
                print(symbol)
                print(img)
                print(name)
            else:
                print("Didnt find")
    else:
        print("No change")
    time.sleep(15)
I only want to print the items when the two responses don't match, but right now it prints the items I want even when the strings do match.
I tried adding another if condition so that when the two request responses match it does nothing and just passes, but that didn't work.
You can use sets to find whether items of the dictionary are changed or not. I've used the compare code from another question but this is somewhat what you can use for your problem
import requests
import time
def dict_compare(d1, d2):
    """Compare two dicts and return (added, removed, modified, same).

    `added`/`removed` are the key sets present only in d1/d2 respectively,
    `modified` maps shared keys with differing values to their (d1, d2)
    value pair, and `same` is the set of shared keys with equal values.
    """
    keys1, keys2 = set(d1), set(d2)
    added = keys1 - keys2
    removed = keys2 - keys1
    modified = {}
    same = set()
    for key in keys1 & keys2:
        if d1[key] == d2[key]:
            same.add(key)
        else:
            modified[key] = (d1[key], d2[key])
    return added, removed, modified, same
# keep the first collection as the baseline for all later comparisons
first = requests.get("https://api-mainnet.magiceden.dev/v2/collections?offset=0&limit=1").json()[0]
while True:
    get_second = requests.get("https://api-mainnet.magiceden.dev/v2/collections?offset=0&limit=1")
    if get_second.status_code == 200:
        print("ok")
    second = get_second.json()[0]
    # dict-level diff instead of fragile string comparison
    added, removed, modified, same = dict_compare(first, second)
    if len(added) > 0 or len(modified) > 0 or len(removed) > 0:
        print("added: ", added)
        print("modified: ", modified)
        print("removed: ", removed)
    else:
        print("No change")
    time.sleep(15)

Generate unique JSON file

I have the following json that I open to extract the content and retrieve the id that I store in a list.
# open the previous export and pull the dataProcessing ids out of 'results'
# NOTE(review): the handle `f` is never closed — prefer a `with` block
f = open('content/dataProcessing_2022_02_23.json')
Item = json.load(f)['results']
df_Item = pd.DataFrame (Item, columns = ['id'])
# ['id'][0] is simply 'id', so this selects the id column's values
List_Item = df_Item[['id'][0]].values.tolist()
List_Item
# endpoints to download, grouped by category
structureDictItem = {
    "Item":[
        "dataProcessingItem"
    ]
}
Here is my script:
def writeContentToFile(mode, customername, workspacename, category, endpoint, jsonContent):
    """Dump *jsonContent* as JSON under <cwd>/customername/workspacename/category.

    The file is named '<endpoint>_<date>.json'; when that name already exists
    an index '(n)' is inserted so earlier exports are never overwritten.
    Relies on the module-level `date` string; `mode` is passed straight to
    open() (the callers use 'a'/'w').
    """
    path = os.path.join(os.getcwd(), customername, workspacename, category)
    Path(path).mkdir(parents=True, exist_ok=True)
    # probe for the first free filename
    index = 1
    while os.path.exists(path + "/" + (endpoint + f'_{date}' if index == 1 else endpoint + f'_({index})_{date}') + '.json'):
        index += 1
    # the context manager closes the file; the original's explicit f.close()
    # inside the `with` block was redundant and has been removed
    with open(path + "/" + (endpoint + f'_{date}' if index == 1 else endpoint + f'_({index})_{date}') + '.json', mode, encoding='utf-8') as f:
        json.dump(jsonContent, f, ensure_ascii=False, indent=4)
# download every configured endpoint, once without and once per id
for categoryItem in structureDictItem:
    for endpointItem in structureDictItem[categoryItem]:
        endpointFilenameItem = endpointItem
        url = DataGalaxy_url + endpointFilenameItem
        params = {"versionId":Workspace_id,
                  "includeAccessData":"true",
                  "includeAttributes":"true",
                  "includeLinks":"true",
                  "limit":5000
                  }
        jsonResponse = requests.get(url, params=params, headers={"Authorization":accessToken}).json()
        writeContentToFile('a', customername, workspacename, categoryItem, endpointFilenameItem, jsonResponse)
        try:
            for item in List_Item:
                params["dataProcessingId"] = item
                jsonResponse = requests.get(url=url, params = params, headers={"Authorization":accessToken}).json()['results']
                # each call writes a new (n)-indexed file, which is why 17
                # separate JSON files appear instead of one combined export
                writeContentToFile('a', customername, workspacename, categoryItem, endpointFilenameItem, jsonResponse)
        except:
            # NOTE(review): bare except hides real failures — catch the
            # specific exception (e.g. KeyError) instead; also `next` on the
            # line below is a no-op reference to the builtin, not `continue`
            print(endpointItem)
            next
However the following result I get is not really the final result I am expecting. Indeed I wish to have all the content in the same JSON. I understand why I am getting this output it is because I have 17 ID so its generating 17 different JSON. I would like help to see how I am able to generate into a single JSON. Can someone give me a hint on it or have an idea what I need to add to my script ?
dataProcessingItem_(10)_2022_02_23.json
dataProcessingItem_(11)_2022_02_23.json
dataProcessingItem_(12)_2022_02_23.json
dataProcessingItem_(13)_2022_02_23.json
dataProcessingItem_(14)_2022_02_23.json
dataProcessingItem_(15)_2022_02_23.json
dataProcessingItem_(16)_2022_02_23.json
dataProcessingItem_(17)_2022_02_23.json
dataProcessingItem_(18)_2022_02_23.json
dataProcessingItem_(2)_2022_02_23.json
dataProcessingItem_(3)_2022_02_23.json
dataProcessingItem_(4)_2022_02_23.json
dataProcessingItem_(5)_2022_02_23.json
dataProcessingItem_(6)_2022_02_23.json
dataProcessingItem_(7)_2022_02_23.json
dataProcessingItem_(8)_2022_02_23.json
dataProcessingItem_(9)_2022_02_23.json
Desired output :
dataProcessingItem_2022_02_23.json

Return a list of words from a text file with Python

I work on a project on python.
I want to return a list of name from a text file.
I start with one name I know.
My text file is like :
ALPHA;n10;Output
ALPHA;n11;Input
ALPHA;n12;Input
BETA;n10;Input
BETA;n14;Input
CHARLIE;n10;Input
CHARLIE;n13;Output
DELTA;n13;Output
DELTA;n12;Input
Let's say I start from the name ALPHA and I know it's an Output.
So I have to search the number link to this name which is n10.
I want to return all the name of the number n10 which are in Input.
So at the end I want the list ["BETA", "CHARLIE"]
For the moment I code the following function :
file = "path of the texte file"
name = "ALPHA"
liste_new_name = []

def search_new_name(liste):
    """Append to *liste* the names wired as Input to any net that *name* Outputs.

    Fixes to the original attempt: the whole file is read (readline() only
    ever returned the first line) and the missing closing parenthesis of the
    append() call is restored.
    """
    with open(file, "r") as file_txt:
        lines = [l.strip() for l in file_txt if l.strip()]
    # nets that `name` drives as an Output
    nums = [l.split(";")[1] for l in lines
            if l.split(";")[0] == name and ";Output" in l]
    if not nums:
        print("No num found")
        return liste
    for l in lines:
        parts = l.split(";")
        if parts[1] in nums and ";Input" in l:
            liste.append(parts[0])
    if liste:
        print(liste)
    else:
        print("No new name found")
    return liste

search_new_name(liste_new_name)
My problem is that I have "No num found" but like the example I know I should have a list.
I would parse the file into a dictionary. This will make searching much easier and will allow you to do multiple searches without having to re-read the file:
def parse_file(path):
    """Read 'NAME;net;direction' lines into a per-name Input/Output mapping.

    Returns {name: {"Input": [nets...], "Output": [nets...]}}.  Lines with an
    unknown direction are reported; lines that do not split into exactly
    three fields are silently skipped.
    """
    table = {}
    with open(path, 'r') as src:
        for line in src:
            try:
                name, net, direction = line.strip().split(';')
                table.setdefault(name, {"Input": [], "Output": []})[direction].append(net)
            except KeyError:
                print(f"Error with: {line}")
            except ValueError:
                pass
    return table
This will return a dictionary like:
{
'ALPHA': {'Input': ['n11', 'n12'], 'Output': ['n10']},
'BETA': {'Input': ['n10', 'n14'], 'Output': []},
'CHARLIE': {'Input': ['n10'], 'Output': ['n13']},
'DELTA': {'Input': ['n12'], 'Output': ['n13']}
}
With that searches can be done with a simple list comprehension:
def search_new_name(name, data):
    """Return every name whose Inputs overlap *name*'s Outputs.

    *data* is the mapping produced by parse_file().  Returns None when
    *name* is unknown; the result preserves the insertion order of *data*.
    """
    if name not in data:
        return None
    driven = set(data[name]["Output"])
    matches = []
    for other, io_map in data.items():
        if driven.intersection(io_map["Input"]):
            matches.append(other)
    return matches
Sample usage:
data = parse_file(r"C:\foo\bar.txt")
print(search_new_name("ALPHA", data))
Output:
['BETA', 'CHARLIE']
You will have to read all the lines and creating a dictionary with the 'number' and 'type' combination as the key will solve the problem.
# module-level configuration consumed by search_new_name below
file = "path of the texte file"
name = "ALPHA"
liste_new_name = []
def search_new_name(name):
    """Return the names that read (Input) the net that *name* writes (Output).

    Reads the semicolon-separated file at module-level `file`.  Two bugs of
    the original are fixed: lines are stripped of their trailing newline
    before splitting (so 'Output\\n' now matches 'Output'), and the result of
    list.append is no longer assigned back (append returns None, which used
    to wipe the accumulated list).
    """
    name_map = {}        # "num_type" -> names wired to that net/direction
    search_key = False
    with open(file, "r") as file_txt:
        all_lines = file_txt.readlines()
    for contenu in all_lines:
        [l_name, l_num, l_type] = contenu.strip().split(";")
        key = l_num + "_" + l_type    # use num and type combination as a key
        if l_name == name and l_type == "Output":
            search_key = l_num + "_" + l_type
        # append mutates in place; never assign its (None) return value
        name_map.setdefault(key, []).append(l_name)
    if search_key is False:
        print("Num not found")
        return []
    search_num = search_key.split('_')[0]
    ## return empty list if no input found
    return name_map.get(search_num + '_Input', [])
# run the lookup for the configured start name
search_new_name(name)
I try to continue with my idea with two functions like that :
file = "path of the text file"
name = "ALPHA"
new_l_name = []
num = []

def search_num(num):
    # NOTE(review): `fichier_txt` below is a typo for `file_txt` (NameError
    # at runtime), the append(...) call is missing its closing parenthesis
    # (SyntaxError), and `return` inside the loop exits on the FIRST match
    file_txt = open(file, "r")
    contenu = file_txt.readline()
    while contenu:
        contenu = fichier_txt.readline()
        if contenu.split(";")[0] == name and ";Output" in contenu:
            num.append(contenu.split(";")[1]
            return num
        else:
            print("No num found")
    file_txt.close()

search_num(num)

def search_new_name(liste):
    file_txt = open(file, "r")
    contenu = file_txt.readline()
    while contenu:
        contenu = file_txt.readline()
        if contenu.split(";")[1] == num[0] and ";Input" in contenu:
            new_name = contenu.split(";")[0]
            liste.append(new_name)
            print("the list of new name : {}".format(liste))
            # NOTE(review): returning here stops after the first hit, which
            # is why only ["BETA"] is found instead of ["BETA", "CHARLIE"]
            return liste
        else:
            print("No new name found")

search_new_name(new_l_name)
Finally, the num we search for is returned, but the list of new names only contains the first new name found in the text file, not the others. It returns ["BETA"] and not ["BETA", "CHARLIE"] as we want.
If someone have an idea.
Thanks.

Remove row from the CSV file if condition met

I am trying to scrape pickels.com.au.
I am trying to update the pickels_dataset.csv file if the link is the same and if the price is not the same them I am removing the list and inserting the new row to the CSV file, but it doesn't remove the old entry from the CSV file.
What would be the best way to remove and update the row in the CSV file.
Below is my code...
import requests
from scrapy.selector import Selector
import csv
import re
from tqdm import tqdm
from time import sleep

# NOTE(review): the CSV is opened here for appending and stays open while
# the same file is re-opened for reading inside the loop below — rows are
# only ever appended, never removed, which is the reported problem
with open('pickels_dataset.csv', 'a+', newline='', encoding='utf-8') as auction_csv_file:
    auction_csv_writer = csv.writer(auction_csv_file)
    live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
    api_request = requests.get(url=live_auctions_api)
    for auctions in api_request.json():
        auction_link = auctions.get('viewSaleListingLink')
        if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
            auction_request = requests.get(url=auction_link)
            response = Selector(text=auction_request.text)
            # the sale id is embedded in an inline script on the page
            sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
            sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
            if sales_id == []:
                continue
            auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
            auction_sale_link_requests = requests.get(url=auction_sale_link)
            auctions_data = auction_sale_link_requests.json().get('SearchResults')
            if auctions_data == []:
                print("NO RESULTS")
            for auction_data in auctions_data:
                if int(auction_data.get('MinimumBid')) > 0:
                    ids = auction_data.get('TargetId')
                    main_title = auction_data.get('Title')
                    short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
                    # NOTE(review): 'M ake' is a typo'd key — .get returns None
                    make = auction_data.get('M ake')
                    model = auction_data.get('Model')
                    variant = auction_data.get('Series')
                    transmission = auction_data.get('Transmission')
                    odometer = auction_data.get('Odometer')
                    state = auction_data.get('Location').get('State')
                    sale_price = auction_data.get('MinimumBid')
                    link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
                    link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
                    sale_date = auction_data.get('SaleEndString')
                    auction_values = [
                        main_title, short_title, make,
                        model, variant, transmission, odometer,
                        state, "${:,.2f}".format(sale_price).strip() ,
                        link, sale_date
                    ]
                    with open('pickels_dataset.csv', 'r+') as csv_read:
                        auction_reader = list(csv.reader(csv_read))
                        for each in auction_reader:
                            # nesting below reconstructed from a mangled paste
                            if link in each:
                                # NOTE(review): each[0] is the title column,
                                # not the price, so this comparison is wrong
                                each_link, each_price = each[9], each[0]
                                if (link == each_link) and (sale_price != each_price):
                                    # clearing the in-memory list does NOT
                                    # touch the file on disk
                                    auction_reader.clear()
                                    print('New list found, old list deleted')
                                    auction_csv_writer.writerow(auction_values)
                                    print('New value added')
                                    continue
                                elif (link == each[9]) and (sale_price == each[0]):
                                    print('Same result already exist in the file')
                                    continue
                            else:
                                auction_csv_writer.writerow(auction_values)
                                print('Unique result found and added.')
                                break
Your current script is opening your auction CSV file for appending, and then whilst it is still open, attempting to open it again for reading. This is probably why it is not updating as expected.
A better approach would be to first read the entire contents of your existing saved auction file into a dictionary. The key could be the link which would then make it easy to determine if you have already seen an existing auction.
Next scrape the current auctions and update the saved_auctions dictionary as needed.
Finally at the end, write the contents of saved_auctions back to the CSV file.
For example:
import requests
from scrapy.selector import Selector
import csv
import re

auction_filename = 'pickels_dataset.csv'
# Load existing auctions into a dictionary with link as key
saved_auctions = {}
with open(auction_filename, newline='', encoding='utf-8') as f_auction_file:
    for row in csv.reader(f_auction_file):
        saved_auctions[row[9]] = row   # dictionary key is link (column 9)
live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
api_request = requests.get(url=live_auctions_api)
for auctions in api_request.json():
    auction_link = auctions.get('viewSaleListingLink')
    if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
        auction_request = requests.get(url=auction_link)
        response = Selector(text=auction_request.text)
        # sale id is embedded in an inline script on the page
        sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
        sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
        if sales_id == []:
            continue
        auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
        auction_sale_link_requests = requests.get(url=auction_sale_link)
        auctions_data = auction_sale_link_requests.json().get('SearchResults')
        if auctions_data == []:
            print("NO RESULTS")
        for auction_data in auctions_data:
            if int(auction_data.get('MinimumBid')) > 0:
                ids = auction_data.get('TargetId')
                main_title = auction_data.get('Title')
                short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
                make = auction_data.get('Make')
                model = auction_data.get('Model')
                variant = auction_data.get('Series')
                transmission = auction_data.get('Transmission')
                odometer = auction_data.get('Odometer')
                state = auction_data.get('Location').get('State')
                minimum_bid = auction_data.get('MinimumBid')
                # formatted once so the file comparison below is like-for-like
                sale_price = "${:,.2f}".format(minimum_bid).strip()
                link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
                link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
                sale_date = auction_data.get('SaleEndString')
                auction_values = [
                    main_title, short_title, make,
                    model, variant, transmission, odometer,
                    state, sale_price,
                    link, sale_date
                ]
                # update in memory only; the file is rewritten once at the end
                if link in saved_auctions:
                    if saved_auctions[link][8] == sale_price:
                        print('Same result already exists in the file')
                    else:
                        print('New value updated')
                        saved_auctions[link] = auction_values  # Updated the entry
                else:
                    print('New auction added')
                    saved_auctions[link] = auction_values
# Update the saved auction file
with open(auction_filename, 'w', newline='', encoding='utf-8') as f_auction_file:
    csv_auction_file = csv.writer(f_auction_file)
    csv_auction_file.writerows(saved_auctions.values())
If you want to also remove auctions that are no longer active, then it would probably be best to simply ignore the saved file and just write all current entries as is.

JSONDecodeError: Expecting value: line 1 column 1 (char 0) when using Pushift API to scrape Reddit Data

import pandas as pd
import requests
import json
import datetime
import csv
def get_pushshift_data(after, before, sub):
    """Fetch up to 400 submissions of subreddit *sub* created in (after, before).

    Returns the 'data' list from the Pushshift submission-search API.
    Pushshift frequently rate-limits; a throttled/empty reply has a non-JSON
    body, which is what raised "JSONDecodeError: Expecting value: line 1
    column 1 (char 0)" — so retry with exponential backoff instead of
    calling .json() blindly.
    """
    import time  # local import keeps the snippet's top-level imports unchanged
    url = ('https://api.pushshift.io/reddit/search/submission/?&after=' + str(after)
           + '&before=' + str(before) + '&subreddit=' + str(sub)
           + '&sort=asc&sort_type=created_utc&size=400')
    print(url)
    for attempt in range(5):
        r = requests.get(url)
        if r.status_code == 200 and r.text.strip():
            return r.json()['data']
        time.sleep(2 ** attempt)  # back off, then retry
    # still failing after the retries: surface a meaningful error
    r.raise_for_status()
    raise ValueError('Pushshift returned an empty or non-JSON response')
def collect_subData(subm):
    """Store the interesting fields of one submission in global subStats.

    The record is a single tuple wrapped in a list, keyed by the submission
    id; optional fields (flair, selftext) get fallback values.
    """
    subId = subm['id']
    created = datetime.datetime.fromtimestamp(subm['created_utc'])  # e.g. 1520561700.0
    record = (
        subId,
        subm['title'],
        subm.get('selftext', ''),            # body is absent on link posts
        subm['url'],
        subm['author'],
        subm['score'],
        created,
        subm['num_comments'],
        subm['permalink'],
        subm.get('link_flair_text', "NaN"),  # flair is optional
    )
    subStats[subId] = [record]
def update_subFile():
    # writes every collected submission (global subStats) into a CSV whose
    # name is read interactively from stdin
    upload_count = 0
    location = "subreddit_data_uncleaned/"
    print("Input filename of submission file, please add .csv")
    filename = input()
    file = location + filename
    # NOTE(review): `as file` rebinds the path variable to the file handle —
    # it works, but a distinct name would be clearer
    with open(file, 'w', newline='', encoding='utf-8') as file:
        a = csv.writer(file, delimiter=',')
        headers = ["Post ID","Title","Body","Url","Author","Score","Publish Date","Total No. of Comments","Permalink","Flair"]
        a.writerow(headers)
        for sub in subStats:
            # each subStats value is a one-element list holding the record tuple
            a.writerow(subStats[sub][0])
            upload_count+=1
        print(str(upload_count) + " submissions have been uploaded into a csv file")
# global dictionary to hold 'subData'
subStats = {}
# tracks no. of submissions
subCount = 0
#Subreddit to query
sub = 'politics'
# Unix timestamp of date to crawl from.
before = int(datetime.datetime(2021,5,17,0,0).timestamp())
after = int(datetime.datetime(2014,1,1,0,0).timestamp())
data = get_pushshift_data(after, before, sub)
# page through the results 400 submissions at a time until nothing is left
while len(data) > 0:
    for submission in data:
        collect_subData(submission)
        subCount+=1
    # Calls getPushshiftData() with the created date of the last submission
    print(len(data))
    print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
    after = data[-1]['created_utc']
    data = get_pushshift_data(after, before, sub)
print(len(data))
update_subFile()
At line 1 I call the get_pushshift_data(after, before, sub) function to scrape the data and there is no error. But when I do the same thing again at line 11, with a different value for the after variable (type: int), the program raises JSONDecodeError: Expecting value: line 1 column 1 (char 0).
This is the image for you to refer to which I have just described above
This is the Error Image

Categories