Could not scrape some values from a JSON file in Python

I would like to scrape the data from a JSON file, but I could not scrape the availability ("available" in the JSON) of each variant. The other values are scraped successfully; the availability column just comes out blank. The line in question is:
varavailability = "" if i >= len(variants) else variants[i].get('available', '')
Here is the full script:
import asyncio
import os
import random
import time
import openpyxl
import aiohttp
from urllib import request

# path="C:/Users/pengoul/Downloads/dl"
path = os.getcwd()
print(f"CWD is {path}")
path = os.path.join(path, "download")
if not os.path.exists(path):
    os.makedirs(path)
# picpath= os.makedirs('picture')

async def request():
    async with aiohttp.ClientSession() as session:
        async with session.get(url='https://hiutdenim.co.uk/products.json?limit=500') as resp:
            html = await resp.json()
            k = list()
            f = openpyxl.Workbook()
            sheet = f.active
            sheet.append(['Name', 'Barcode', 'Product Category', 'Image', 'Internal Reference', 'Sales Price', 'Product Tags'])
            products = []
            print("Saving to excel ...")
            for i in html['products']:
                title = i.get('title')
                id1 = i.get('id')
                product_type = i.get('product_type')
                images = [img.get('src', '') for img in i.get('images', [])]
                products.append((title, id1, product_type, images))
                variants = [var for var in i.get('variants')]
                for i in range(max(len(images), len(variants))):
                    imgsrc = "" if i >= len(images) else images[i]
                    varsku = "" if i >= len(variants) else variants[i].get('sku', '')
                    varprice = "" if i >= len(variants) else variants[i].get('price', '')
                    varavailability = "" if i >= len(variants) else variants[i].get('available', '')
                    sheet.append([title, "'" + str(id1), product_type, imgsrc, varsku, varprice, varavailability])
            f.save(f"result230102.xlsx")
            print("Downloading images ...")
            for product in products:
                title, id1, product_type, images = product
                for seq, imgurl in enumerate(images):
                    print(f"Downloading img for {id1} ({seq + 1}/{len(images)})")
                    request.urlretrieve(imgurl, os.path.join(path, f"{id1}-{seq + 1}.jpg"))

async def download(url):
    image = url[0]
    file_name = f'{url[1]}.jpg'
    print(f'picpath/{file_name}')
    async with aiohttp.ClientSession() as session:
        time.sleep(random.random())
        async with session.get(image) as resp:
            with open(path + file_name, mode='wb') as f:
                f.write(await resp.content.read())
            # print(f'picpath/{file_name}')

async def main():
    if not os.path.exists(path):
        os.mkdir(path)
    tasks = []
    await request()
    # for url in urls:
    #     tasks.append(asyncio.create_task(download(url)))
    # await asyncio.wait(tasks)

if __name__ == '__main__':
    print(os.getpid())
    t1 = time.time()
    urls = []
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    t2 = time.time()
    print('total:', t2 - t1)
It shows up blank in that column. I would like to scrape the values of "available" from the JSON.

I ran your code in my debugger, putting a breakpoint at the line in question. This breakpoint is hit many times during execution. In some cases, it produces a True value for varavailability as you're expecting.
At some point, this line executes when the value of i is 1 but the length of variants is only 1. In that case, the condition i >= len(variants) is true, so varavailability is set to "". i can reach 1 because the length of images in that case is 5: your loop for i in range(max(len(images), len(variants))): iterates i from 0 to 4, and for every i greater than 0, varavailability is set to "". I can't be sure this is the exact case you're asking about, but it very likely is.
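To see this concretely, here is a minimal, self-contained sketch with made-up data (not the real API response) that reproduces the padding behaviour of that loop:

# Toy data mimicking one product: 5 images but only 1 variant (hypothetical values).
images = ["img1.jpg", "img2.jpg", "img3.jpg", "img4.jpg", "img5.jpg"]
variants = [{"sku": "A1", "price": "95.00", "available": True}]

for i in range(max(len(images), len(variants))):
    varavailability = "" if i >= len(variants) else variants[i].get('available', '')
    print(i, repr(varavailability))
# Prints True only for i == 0; every later row gets "" because i >= len(variants).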
UPDATE:
As to how to fix this, the question centers on how the contents of variants and images relate to each other and on what you are doing in your loop:
for i in range(max(len(images), len(variants))):
    imgsrc = "" if i >= len(images) else images[i]
    varsku = "" if i >= len(variants) else variants[i].get('sku', '')
    varprice = "" if i >= len(variants) else variants[i].get('price', '')
    varavailability = "" if i >= len(variants) else variants[i].get('available', '')
    sheet.append([title, "'" + str(id1), product_type, imgsrc, varsku, varprice, varavailability])
It seems that the code is iterating over a list of products, and each product has two lists associated with it: a list of images and a list of variants. My guess is that the contents of these two lists are independent: each value in images does not correspond to a particular entry in variants.
If what you want is a table of product variants, one possible solution is to associate all of the images for a particular product with each of the variations of that product, and then just iterate over each of the variants. That could be something like this:
imgsrc = " ".join(images)
for variant in variants:
    varsku = variant.get('sku', '')
    varprice = variant.get('price', '')
    varavailability = variant.get('available', '')
    sheet.append([title, "'" + str(id1), product_type, imgsrc, varsku, varprice, varavailability])
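Alternatively, if you do want one spreadsheet row per image/variant position, itertools.zip_longest pads the shorter list for you. This is only a rough sketch of that variant of the loop, reusing the variable names from the code above (title, id1, product_type, images, variants, sheet), not a tested drop-in:

from itertools import zip_longest

# Pair images and variants positionally; missing entries become None and are
# turned into an empty string / empty dict so .get() still works.
for imgsrc, variant in zip_longest(images, variants, fillvalue=None):
    imgsrc = imgsrc or ""
    variant = variant or {}
    varsku = variant.get('sku', '')
    varprice = variant.get('price', '')
    varavailability = variant.get('available', '')
    sheet.append([title, "'" + str(id1), product_type, imgsrc, varsku, varprice, varavailability])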

Related

My requests loop is wrong, I can't find the error when writing to the blob

I need to make the requests and then save each one to the blob, but my loop keeps going and always saves the last request to every filename.
while True:
    for Chunks in chunkID:
        url = f"https://####/2/0/workspaces/{workspaceID}/models/{modelID}/files/{fileID}/chunks/" + str(Chunks)
        urlsTotais.append(url)
    for Chunkss in chunkID:
        filename = '/mnt/blob/Brazil/Finance/Fpna/API/Anaplan/'+sistema+'/' + str(now.year) +'/'+ str(now.month).zfill(2) +'/'+ str(now.day).zfill(2)+'/'+'file'+ str(Chunkss) +'.parquet'
        BlobPreLandingZonePathTest.append(filename)
    for RequestsUrl in urlsTotais:
        response08 = requests.get(RequestsUrl, headers=headers_oc)
        print('complete:{}'.format(RequestsUrl))
        #time.sleep(3)
        if response08.status_code == 200 and response08 != None:
            x = response08.text
            df = pd.read_csv(io.StringIO(x), sep=',', header=9, low_memory=False)
            df = df.T.reset_index(drop=True).T
            dff = spark.createDataFrame(df)
            # the inner loop highlighted in the question:
            for name in BlobPreLandingZonePathTest:
                toblob = dff.write.mode('overwrite').parquet(name)
                print(name)
        else:
            print('Found {} CHUNKS!!'.format(Chunks))
            break
ERROR:
complete:https://####/2/0/workspaces/8a81b0116d9dab55016e225c95726bc6/models/5365B94E805C44A3AD4564591AD29CFC/files/116000000036/chunks/0
/mnt/blob/########/2022/12/17/file0.parquet
/mnt/blob/########/2022/12/17/file1.parquet
/mnt/blob/########/2022/12/17/file2.parquet
I just need one file per request, like this:
complete:https://####/2/0/workspaces/8a81b0116d9dab55016e225c95726bc6/models/5365B94E805C44A3AD4564591AD29CFC/files/116000000036/chunks/0
/mnt/blob/########/2022/12/17/file0.parquet
complete:https://####/2/0/workspaces/8a81b0116d9dab55016e225c95726bc6/models/5365B94E805C44A3AD4564591AD29CFC/files/116000000036/chunks/1
/mnt/blob/########/2022/12/17/file1.parquet
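Since each chunk URL has a matching target path in BlobPreLandingZonePathTest, one way to avoid rewriting every file on every response is to iterate the two lists together instead of nesting the loops. This is only a sketch built on the names from the question (requests, pd, io, spark, headers_oc, urlsTotais and BlobPreLandingZonePathTest are assumed to be set up as above), not a verified fix:

# Sketch only: pair each chunk URL with its own target path instead of
# looping over all paths for every response.
for request_url, blob_path in zip(urlsTotais, BlobPreLandingZonePathTest):
    response08 = requests.get(request_url, headers=headers_oc)
    print('complete:{}'.format(request_url))
    if response08.status_code == 200:
        df = pd.read_csv(io.StringIO(response08.text), sep=',', header=9, low_memory=False)
        df = df.T.reset_index(drop=True).T
        dff = spark.createDataFrame(df)
        dff.write.mode('overwrite').parquet(blob_path)  # one parquet file per chunk
        print(blob_path)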

Remove row from the CSV file if condition met

I am trying to scrape pickels.com.au.
I am trying to update the pickels_dataset.csv file: if the link is the same but the price is not, I want to remove the old row and insert the new row into the CSV file. However, it doesn't remove the old entry from the CSV file.
What would be the best way to remove and update a row in the CSV file?
Below is my code...
import requests
from scrapy.selector import Selector
import csv
import re
from tqdm import tqdm
from time import sleep

with open('pickels_dataset.csv', 'a+', newline='', encoding='utf-8') as auction_csv_file:
    auction_csv_writer = csv.writer(auction_csv_file)
    live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
    api_request = requests.get(url=live_auctions_api)
    for auctions in api_request.json():
        auction_link = auctions.get('viewSaleListingLink')
        if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
            auction_request = requests.get(url=auction_link)
            response = Selector(text=auction_request.text)
            sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
            sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
            if sales_id == []:
                continue
            auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
            auction_sale_link_requests = requests.get(url=auction_sale_link)
            auctions_data = auction_sale_link_requests.json().get('SearchResults')
            if auctions_data == []:
                print("NO RESULTS")
            for auction_data in auctions_data:
                if int(auction_data.get('MinimumBid')) > 0:
                    ids = auction_data.get('TargetId')
                    main_title = auction_data.get('Title')
                    short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
                    make = auction_data.get('Make')
                    model = auction_data.get('Model')
                    variant = auction_data.get('Series')
                    transmission = auction_data.get('Transmission')
                    odometer = auction_data.get('Odometer')
                    state = auction_data.get('Location').get('State')
                    sale_price = auction_data.get('MinimumBid')
                    link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
                    link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
                    sale_date = auction_data.get('SaleEndString')
                    auction_values = [
                        main_title, short_title, make,
                        model, variant, transmission, odometer,
                        state, "${:,.2f}".format(sale_price).strip(),
                        link, sale_date
                    ]
                    with open('pickels_dataset.csv', 'r+') as csv_read:
                        auction_reader = list(csv.reader(csv_read))
                        for each in auction_reader:
                            if link in each:
                                each_link, each_price = each[9], each[0]
                                if (link == each_link) and (sale_price != each_price):
                                    auction_reader.clear()
                                    print('New list found, old list deleted')
                                    auction_csv_writer.writerow(auction_values)
                                    print('New value added')
                                    continue
                                elif (link == each[9]) and (sale_price == each[0]):
                                    print('Same result already exist in the file')
                                    continue
                            else:
                                auction_csv_writer.writerow(auction_values)
                                print('Unique result found and added.')
                                break
Your current script is opening your auction CSV file for appending, and then whilst it is still open, attempting to open it again for reading. This is probably why it is not updating as expected.
A better approach would be to first read the entire contents of your existing saved auction file into a dictionary. The key could be the link which would then make it easy to determine if you have already seen an existing auction.
Next scrape the current auctions and update the saved_auctions dictionary as needed.
Finally at the end, write the contents of saved_auctions back to the CSV file.
For example:
import requests
from scrapy.selector import Selector
import csv
import re

auction_filename = 'pickels_dataset.csv'

# Load existing auctions into a dictionary with link as key
saved_auctions = {}
with open(auction_filename, newline='', encoding='utf-8') as f_auction_file:
    for row in csv.reader(f_auction_file):
        saved_auctions[row[9]] = row  # dictionary key is link

live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
api_request = requests.get(url=live_auctions_api)
for auctions in api_request.json():
    auction_link = auctions.get('viewSaleListingLink')
    if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
        auction_request = requests.get(url=auction_link)
        response = Selector(text=auction_request.text)
        sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
        sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
        if sales_id == []:
            continue
        auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
        auction_sale_link_requests = requests.get(url=auction_sale_link)
        auctions_data = auction_sale_link_requests.json().get('SearchResults')
        if auctions_data == []:
            print("NO RESULTS")
        for auction_data in auctions_data:
            if int(auction_data.get('MinimumBid')) > 0:
                ids = auction_data.get('TargetId')
                main_title = auction_data.get('Title')
                short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
                make = auction_data.get('Make')
                model = auction_data.get('Model')
                variant = auction_data.get('Series')
                transmission = auction_data.get('Transmission')
                odometer = auction_data.get('Odometer')
                state = auction_data.get('Location').get('State')
                minimum_bid = auction_data.get('MinimumBid')
                sale_price = "${:,.2f}".format(minimum_bid).strip()
                link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
                link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
                sale_date = auction_data.get('SaleEndString')
                auction_values = [
                    main_title, short_title, make,
                    model, variant, transmission, odometer,
                    state, sale_price,
                    link, sale_date
                ]
                if link in saved_auctions:
                    if saved_auctions[link][8] == sale_price:
                        print('Same result already exists in the file')
                    else:
                        print('New value updated')
                        saved_auctions[link] = auction_values  # update the entry
                else:
                    print('New auction added')
                    saved_auctions[link] = auction_values

# Update the saved auction file
with open(auction_filename, 'w', newline='', encoding='utf-8') as f_auction_file:
    csv_auction_file = csv.writer(f_auction_file)
    csv_auction_file.writerows(saved_auctions.values())
If you want to also remove auctions that are no longer active, then it would probably be best to simply ignore the saved file and just write all current entries as is.
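For instance, a minimal sketch of that idea could collect the current run's auctions into a separate dictionary and write only those; current_auctions is a hypothetical name, not part of the code above:

# Sketch: keep only auctions found in the current scrape.
current_auctions = {}
# ... inside the scraping loop, fill current_auctions[link] = auction_values
#     instead of updating saved_auctions ...
with open(auction_filename, 'w', newline='', encoding='utf-8') as f_auction_file:
    csv.writer(f_auction_file).writerows(current_auctions.values())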

JSONDecodeError: Expecting value: line 1 column 1 (char 0) when using Pushshift API to scrape Reddit data

import pandas as pd
import requests
import json
import datetime
import csv

def get_pushshift_data(after, before, sub):
    url = 'https://api.pushshift.io/reddit/search/submission/?&after=' + str(after) + '&before='+ str(before) + '&subreddit='+ str(sub) + '&sort=asc&sort_type=created_utc&size=400'
    print(url)
    r = requests.get(url).json()
    # data = json.loads(r.text, strict=False)
    return r['data']

def collect_subData(subm):
    subData = list()  # list to store data points
    title = subm['title']
    url = subm['url']
    try:
        flair = subm['link_flair_text']
    except KeyError:
        flair = "NaN"
    try:
        # returns the body of the posts
        body = subm['selftext']
    except KeyError:
        body = ''
    author = subm['author']
    subId = subm['id']
    score = subm['score']
    created = datetime.datetime.fromtimestamp(subm['created_utc'])  # 1520561700.0
    numComms = subm['num_comments']
    permalink = subm['permalink']
    subData.append((subId,title,body,url,author,score,created,numComms,permalink,flair))
    subStats[subId] = subData

def update_subFile():
    upload_count = 0
    location = "subreddit_data_uncleaned/"
    print("Input filename of submission file, please add .csv")
    filename = input()
    file = location + filename
    with open(file, 'w', newline='', encoding='utf-8') as file:
        a = csv.writer(file, delimiter=',')
        headers = ["Post ID","Title","Body","Url","Author","Score","Publish Date","Total No. of Comments","Permalink","Flair"]
        a.writerow(headers)
        for sub in subStats:
            a.writerow(subStats[sub][0])
            upload_count += 1
        print(str(upload_count) + " submissions have been uploaded into a csv file")

# global dictionary to hold 'subData'
subStats = {}
# tracks no. of submissions
subCount = 0
# Subreddit to query
sub = 'politics'
# Unix timestamp of date to crawl from.
before = int(datetime.datetime(2021,5,17,0,0).timestamp())
after = int(datetime.datetime(2014,1,1,0,0).timestamp())

data = get_pushshift_data(after, before, sub)
while len(data) > 0:
    for submission in data:
        collect_subData(submission)
        subCount += 1
    # Calls getPushshiftData() with the created date of the last submission
    print(len(data))
    print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
    after = data[-1]['created_utc']
    data = get_pushshift_data(after, before, sub)
print(len(data))
update_subFile()
At line 1 I call the get_pushshift_data(after, before, sub) function to scrape the data and there is no error. But when I do the same thing again at line 11, with a different value for the after variable (type: int), the program raises JSONDecodeError: Expecting value: line 1 column 1 (char 0).
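No answer is shown for this question, but a JSONDecodeError at that point usually means the endpoint returned an empty or non-JSON body (for example when rate limited). A hedged, untested sketch of a more defensive get_pushshift_data, keeping the same URL as in the question, would check the status code and retry before calling .json(); the retries parameter and backoff are assumptions, not part of the original script:

import time
import requests

def get_pushshift_data(after, before, sub, retries=5):
    # Same query URL as in the question; retry a few times if the body is not valid JSON.
    url = ('https://api.pushshift.io/reddit/search/submission/?&after=' + str(after)
           + '&before=' + str(before) + '&subreddit=' + str(sub)
           + '&sort=asc&sort_type=created_utc&size=400')
    for attempt in range(retries):
        r = requests.get(url)
        if r.status_code == 200:
            try:
                return r.json()['data']
            except ValueError:  # covers json.JSONDecodeError (non-JSON body)
                pass
        time.sleep(2 ** attempt)  # simple exponential backoff before trying again
    raise RuntimeError(f'No valid JSON from {url} after {retries} attempts')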

Why does my Python script with sleep in an infinite loop stop running?

I'm working on a Python script that transfers data from an .xlsx file to an HTML file: I read/parse the Excel file with pandas and use BeautifulSoup to edit the HTML (reading the paths to these two files from two .txt files). This works on its own. However, the script has to run constantly, so everything is called in an infinite while loop that runs every 15 minutes, printing messages to the console each time.
My problem is the following: after a seemingly random number of loops, the code simply stops running, meaning no text on the console and no changes in the HTML file. When this happens, I have to restart the script to get it working again.
Here is the main function:
def mainFunction():
    if getattr(sys, 'frozen', False):
        application_path = os.path.dirname(sys.executable)
    elif __file__:
        application_path = os.path.dirname(__file__)
    excelFiles = open(str(application_path) + "\\pathsToExcels.txt")
    htmlFiles = open(str(application_path) + "\\pathsToHTMLs.txt")
    sheetFiles = open(str(application_path) + "\\sheetNames.txt")
    print("Reading file paths ...")
    linesEx = excelFiles.readlines()
    linesHtml = htmlFiles.readlines()
    linesSheet = sheetFiles.readlines()
    print("Begining transfer")
    for i in range(len(linesEx)):
        excel = linesEx[i].strip()
        html = linesHtml[i].strip()
        sheet = linesSheet[i].strip()
        print("Transfering data for " + sheet)
        updater = UpdateHtml(excel, sheet, str(application_path) + "\\pageTemplate.html", html)
        updater.refreshTable()
        updater.addData()
        updater.saveHtml()
    print("Transfer done")
    excelFiles.close()
    htmlFiles.close()
    sheetFiles.close()
UpdateHtml is the one actually responsible for the data transfer.
The "__main__" which also contains the while loop:
if __name__ == "__main__":
    while(True):
        print("Update at " + str(datetime.now()))
        mainFunction()
        print("Next update in 15 minutes\n")
        time.sleep(900)
And finally, the batch code that launches this
python "C:\Users\Me\PythonScripts\excelToHtmlTransfer.py"
pause
From what I've noticed through trials, this doesn't happen when sleep is set to under 5 minutes (it still happens at 5 minutes) or when it's omitted altogether.
Does anyone have any clue why this might be happening, or any alternatives to sleep in this context?
EDIT: UpdateHtml:
import pandas as pd
from bs4 import BeautifulSoup

class UpdateHtml:
    def __init__(self, pathToExcel, sheetName, pathToHtml, pathToFinalHtml):
        with open(pathToHtml, "r") as htmlFile:
            self.soup = BeautifulSoup(htmlFile.read(), features="html.parser")
        self.df = pd.read_excel(pathToExcel, sheet_name=sheetName)
        self.html = pathToFinalHtml
        self.sheet = sheetName

    def refreshTable(self):
        # deletes the inner html of all table cells
        for i in range(0, 9):
            td = self.soup.find(id='ok' + str(i))
            td.string = ''
            td = self.soup.find(id='acc' + str(i))
            td.string = ''
            td = self.soup.find(id='nok' + str(i))
            td.string = ''
            td = self.soup.find(id='problem' + str(i))
            td.string = ''

    def prepareData(self):
        # changes the names of columns according to their data
        counter = 0
        column_names = {}
        for column in self.df.columns:
            if 'OK' == str(self.df[column].values[6]):
                column_names[self.df.columns[counter]] = 'ok'
            elif 'Acumulate' == str(self.df[column].values[6]):
                column_names[self.df.columns[counter]] = 'acc'
            elif 'NOK' == str(self.df[column].values[6]):
                column_names[self.df.columns[counter]] = 'nok'
            elif 'Problem Description' == str(self.df[column].values[7]):
                column_names[self.df.columns[counter]] = 'prob'
            counter += 1
        self.df.rename(columns=column_names, inplace=True)

    def saveHtml(self):
        with open(self.html, "w") as htmlFile:
            htmlFile.write(self.soup.prettify())

    def addData(self):
        groupCounter = 0
        index = 0
        self.prepareData()
        for i in range(8, 40):
            # Check if we have a valid value in the ok column
            if pd.notna(self.df['ok'].values[i]) and str(self.df['ok'].values[i]) != "0":
                td = self.soup.find(id='ok' + str(index))
                td.string = str(self.df['ok'].values[i])
            # Check if we have a valid value in the accumulate column
            if pd.notna(self.df['acc'].values[i]) and str(self.df['acc'].values[i]) != "0":
                td = self.soup.find(id='acc' + str(index))
                td.string = str(self.df['acc'].values[i])
            # Check if we have a valid value in the nok column
            if pd.notna(self.df['nok'].values[i]) and str(self.df['nok'].values[i]) != "0":
                td = self.soup.find(id='nok' + str(index))
                td.string = str(self.df['nok'].values[i])
            # Check if we have a valid value in the problem column
            if pd.notna(self.df['prob'].values[i]):
                td = self.soup.find(id='problem' + str(index))
                td.string = str(self.df['prob'].values[i])
            if groupCounter == 3:
                index += 1
                groupCounter = 0
            else:
                groupCounter += 1
The Excel file I'm working with is a bit unusual, hence all the (seemingly) redundant operations; still, it has to remain in its current form.
The main point is that each 'row' containing data is actually made up of 4 regular rows, hence the need for groupCounter.
Found a workaround for this problem. Basically what I did was move the loop into the batch script, like so:
:whileLoop
python "C:\Users\Me\PythonScripts\excelToHtmlTransfer.py"
timeout /t 900 /nobreak
goto :whileLoop
After leaving it to run for a few hours the situation didn't occur anymore, but unfortunately I still don't know what caused it.
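If the Python-side loop is ever reinstated, one hedged way to at least see why it dies is to wrap each iteration in a try/except, flush stdout, and log to a file. This is purely a diagnostic sketch: the log file name is an arbitrary choice, and mainFunction is the function from the question:

import logging
import time
from datetime import datetime

# Diagnostic sketch: log every iteration and any exception so a silent stop leaves a trace.
logging.basicConfig(filename='excel_to_html.log', level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

if __name__ == "__main__":
    while True:
        try:
            print("Update at " + str(datetime.now()), flush=True)
            logging.info("Starting update")
            mainFunction()
            logging.info("Update finished")
        except Exception:
            logging.exception("mainFunction failed")
        print("Next update in 15 minutes\n", flush=True)
        time.sleep(900)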

Python implementation with GitHub

How can I fix the following bug, where comments from commits that have been merged are not downloaded? I think the problem lies in moving from REST to GraphQL, but I don't know exactly how to fix it.
The following script is used for:
Get all quantified activities from a set of GitHub repositories. The
list of repos to be searched for are found in the repos.lst file.
Authentication to GitHub is also stored in separate file called github.token
containing the username and password/access token on two separate lines.
Quantified activities include merged pull-requests, closed issues (except for
those explicitly removed from the list for not being constructive) and comments.
Pull-requests:
Only the closed pull-requests are listed, and their merge status determined,
finally the merged pull-requests are stored in a JSON file, with entries
containing the reference number, the repository, the title, the author and the
pull-request URL.
Issues:
Only the closed issues are listed, the pull-requests (which are treated also as
issues by the GitHub API) removed from them, issues blacklisted in the
blacklist.lst file are also removed from the list, finally the remaining
issues are stored in a JSON file, with entries containing the reference number,
the repository, the title, the author and the issue URL.
Comments:
Comments from the commits, from the issues and from the pull-requests are all
listed and stored in JSON file with entries containing the author, the comment
ID, the repository, the comment content and the comment's URL. Issues comments
and pull-request comments will also contain the reference number of the issue
respectively the pull-request, with issues additionally having also the
original author of the issue, while the commit comments will contain the SHA1 of
the commit.
#!/usr/bin/env python3

# requests library is used to access GitHub API
import requests
import json
import re
import sys
import os.path
import bisect
import getpass
from itertools import chain

import repos

year = repos.get_year()
path = '../../{}/rezultate/'.format(year)

# read the list of repositories
repo_name = re.compile(r'([\w-]+)/([\w-]+)')
repos = [('etc-so', 'syllabus')]
with open(path + 'data/repos.lst', 'r') as f:
    for s in f:
        s = s.strip()
        # ignore empty lines
        if s and not s.startswith('#'):
            m = repo_name.fullmatch(s)
            if not m:
                print("Invalid repo name: " + s)
            repos.append((m.group(1), m.group(2)))

# read the list of students
students = []
with open(path + 'data/students.json', 'r') as f:
    students = [x['user'] for x in json.load(f)]
if not students:
    print("No students to check for")
    sys.exit(1)

# get the access token
if os.path.exists('github.token'):
    with open('github.token', 'r') as f:
        auth = (f.readline().strip(), f.readline().strip())
else:
    auth = (input("Enter GitHub username: "), getpass.getpass('Password: '))

# construct a labels list, so it can be added to the comments as well
issue_labels = {}

# get the persistent PR, issue and comment lists
if os.path.exists(path + 'data/pulls.json'):
    with open(path + 'data/pulls.json', 'r') as f:
        saved_pulls = dict((x['id'],(x['week'], x['value'], x['author']) + ((x['multi'],) if 'multi' in x else ())) for x in json.load(f))
else:
    saved_pulls = {}

if os.path.exists(path + 'data/issues.json'):
    with open(path + 'data/issues.json', 'r') as f:
        saved_issues = dict((x['id'],(x['week'], x['value'])) for x in json.load(f))
else:
    saved_issues = {}

if os.path.exists(path + 'data/comments.json'):
    with open(path + 'data/comments.json', 'r') as f:
        saved_comments = dict((x['id'],(x['week'], x['value'])) for x in json.load(f))
else:
    saved_comments = {}

current_week = 0
# if there were already some activity then move ahead to the last week
if saved_pulls or saved_issues or saved_comments:
    current_week = max(x[0] for x in chain(saved_pulls.values(),
                                           saved_issues.values(),
                                           saved_comments.values()))

# if there is a command line argument use it as current week
if len(sys.argv) == 2:
    if sys.argv[1].isdigit():
        current_week = int(sys.argv[1])
    # -n increments the current week if it can be obtained from the activity
    elif sys.argv[1] == '-n' and current_week != 0:
        current_week += 1
        print("Switching to week {}".format(current_week))
    elif sys.argv[1] == '?':
        print("Current week is {}".format(current_week))
        sys.exit(0)
    else:
        print("Invalid command line parameter")
        sys.exit(1)
elif len(sys.argv) > 2:
    print("Too many parameters")
    sys.exit(1)

# if no current week was obtained, start with week 1
if not current_week:
    current_week = 1

api_url = "https://api.github.com/"
get_url = "repos/{}/{}/{}"

c_params = {
    'state': 'closed',   # get closed pull-requests/issues
    'sort': 'created',
    'direction': 'asc',  # sort it in ascending order by their creation time
}

pr = []
pr_times = []
gi = []
comments = []
c_times = []
authors = {}

issue_comment_re = re.compile(r'.*/(pull|issues)/([0-9]+)#.*')
link_rel = re.compile(r'<([^>]+)>; rel="([\w]+)"(?:, )?')

request_count = 0

def github_get(get_code, **params):
    global request_count
    global raw_data
    res = []
    url = api_url + get_url.format(org, repo, get_code)
    while url:
        r = requests.get(url, auth=auth, params=params)
        if r.status_code != 200:
            print('Status code {} received'.format(r.status_code))
            print(r.content)
            sys.exit(1)
        links = dict((m[1],m[0]) for m in link_rel.findall(r.headers.get('link', '')))
        url = links.get('next')
        res.extend(r.json())
        request_count = r.headers['X-RateLimit-Remaining']
    return res

label_values = {
    'bug': 2,
    'enhancement': 2,
    'challenge': 2,
    'help wanted': 1,
    'invalid': -1,
    'duplicate': -2
}

def label_value(labels):
    # check predefined values
    values = [label_values[x] for x in labels if x in label_values]
    values += [x.count('★') for x in labels]
    if values:
        if min(values) < 0:
            return min(values)
        value = max(values)
    else:
        value = 0
    # add all white stars
    value += sum(x.count('☆') for x in labels)
    return value

def issue_value(value, issue_id=None):
    if issue_id:
        old_value = saved_issues.get(issue_id)
        if old_value and old_value[1] != value:
            print("Value change detected in issue {}".format(issue_id))
    return value

def pull_value(label, pull_id=None, ref=0):
    if label:
        value = label_value(label)
    else:
        value = None
        print("No label for pull-request {}".format(ref))
    if pull_id:
        old_value = saved_pulls.get(pull_id)
        if old_value and old_value[1] is not None and old_value[1] != value:
            print("Value change detected in pull-request {}".format(ref))
            if value is None:
                value = old_value[1]
    return value

ptotal = 0
itotal = 0
ctotal = 0

# pass through the repos
for org,repo in repos:
    print('Processing repo {}:'.format(repo), end=' ', flush=True)

    # get all the issues, do this first as it all includes the pull-requests
    # for which only here we can determine the labels
    issues = github_get('issues', state='all', sort='created', direction='asc')
    icount = 0;
    for i in issues:
        ref = int(i['number'])
        author = i['user']['login']
        authors[(repo, ref)] = author
        label = [x['name'] for x in i['labels']]
        issue_labels[(repo, ref)] = label
        if 'pull_request' not in i and author in students:
            gi.append(
                {
                    'id': i['id'],
                    'repo': repo,
                    'ref': ref,
                    'title': i['title'],
                    'url': i['html_url'],
                    'author': author,
                    'label': label,
                    'week' : saved_issues.get(i['id'], [current_week])[0],
                    'value' : issue_value(label_value(label), i['id'])
                })
            icount += 1

    # get the merged pull-requests
    pulls = github_get('pulls', state='closed')
    pcount = 0;
    #print(r.headers)
    for p in pulls:
        ref = int(p['number'])
        author = p['user']['login']
        label = issue_labels.get((repo, ref), [])
        if p['merged_at'] and author in students and (not label or label_value(label) >= 0):
            i = bisect.bisect_right(pr_times, p['merged_at'])
            pr_times.insert(i, p['merged_at'])
            # check authors of included commits
            if p['id'] in saved_pulls:
                author = saved_pulls[p['id']][2]
            else:
                pcommits = github_get('pulls/{}/commits'.format(ref))
                pc_authors = [author]
                for a in pcommits:
                    al = a['author'].get('login') if a['author'] else None
                    cl = a['committer'].get('login') if a['committer'] else None
                    if al == cl or cl == 'web-flow' or cl == 'kcs':
                        aa = al
                    else:
                        aa = ':'.join(x for x in [al, cl] if x)
                    if aa and aa not in pc_authors and aa != 'kcs':
                        pc_authors.append(aa)
                if len(pc_authors) != 1:
                    author = pc_authors
            pr.insert(i, {
                'id': p['id'],
                'repo': repo,
                'ref': ref,
                'title': p['title'],
                'url': p['html_url'],
                'label': label,
                'author': author,
                'week': saved_pulls.get(p['id'], [current_week])[0],
                'value': pull_value(label, p['id'], ref)
            })
            if len(saved_pulls.get(p['id'], [])) > 3:
                pr[i]['multi'] = saved_pulls[p['id']][3]
            pcount += 1

    # and now for the comments:
    # this is more troublesome as constructive comments must be selected
    # manually, so we are keeping persistent JSON file for the comments
    # holding a valid tag, newly downloaded comments have this tag unset
    # and they can be validated afterwards manually (or by script)
    ccount = 0;
    # 3 types of comments exist on GitHub: issue comments, review comments
    # and commit comments, all have to be handled separately
    ccomments = github_get('comments', per_page=50, sort='created')
    #with open('all_ccomments.json', 'w') as f:
    #    json.dump(ccomments, f, indent=4, sort_keys=True)
    for c in ccomments:
        author = c['user']['login']
        if author in students:
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'commit': c['commit_id'],
                    'msg': c['body'],
                    'url': c['html_url'],
                    'author': author,
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            ccount += 1

    icomments = github_get('issues/comments', per_page=50, sort='created')
    for c in icomments:
        author = c['user']['login']
        if author in students:
            url = c['html_url']
            m = issue_comment_re.fullmatch(url)
            if not m:
                print("Problem parsing issue url " + url)
                sys.exit(1)
            ref = int(m.group(2))
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'issue': ref,
                    'msg': c['body'],
                    'url': url,
                    'author': author,
                    'issue_author': authors[(repo, ref)],
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            if m.group(1) == 'issues' and (repo, ref) in issue_labels:
                comments[i]['label'] = issue_labels[(repo, ref)]
            ccount += 1

    pcomments = github_get('pulls/comments', per_page=50, sort='created')
    for c in pcomments:
        author = c['user']['login']
        if author in students:
            ref = int(c['pull_request_url'].rsplit('/', 1)[1])
            i = bisect.bisect_right(c_times, c['created_at'])
            c_times.insert(i, c['created_at'])
            comments.insert(i,
                {
                    'id': c['id'],
                    'repo': repo,
                    'pull': ref,
                    'msg': c['body'],
                    'url': c['html_url'],
                    'author': author,
                    'week': saved_comments.get(c['id'], [current_week])[0],
                    'value': saved_comments[c['id']][1] if c['id'] in saved_comments else None
                })
            ccount += 1

    print('found {} merged pull-requests, {} issues and {} comments'.format(pcount, icount, ccount))
    ptotal += pcount
    itotal += icount
    ctotal += ccount

with open(path + 'data/pulls.json', 'w') as f:
    json.dump(pr, f, indent=4, sort_keys=True, ensure_ascii=False)
with open(path + 'data/issues.json', 'w') as f:
    json.dump(gi, f, indent=4, sort_keys=True, ensure_ascii=False)
with open(path + 'data/comments.json', 'w') as f:
    json.dump(comments, f, indent=4, sort_keys=True, ensure_ascii=False)

print("Total: {} merged pull-requests, {} issues and {} comments [{} contributions]".format(
    ptotal, itotal, ctotal, ptotal + itotal + ctotal))
print("Remaining request count: {}".format(request_count))

repo_key = dict((r[1],i) for i,r in enumerate(repos))

def sort_repos(x):
    '''Sort the repos in a blacklist with (repo,ref) structure in the order
    as they appear in the repos list.
    If repo is not in the list then put them afterwards
    '''
    if x[0] in repo_key:
        return (repo_key[x[0]],x[1])
    else:
        return (len(repos),) + x
