I have been trying to save a CSV file with information to add to db. And, is necessary to remove the single quotes, and ")". I already tried doing the replace but, didn't worked.
Also, I am doing this by an admin view. I add the csv file with informations to create objects on my db. And, it's from multiple tables. I don't know if I am using the right code or logic for this.
def upload_csv(self,request):
form = CSVImportForm(request.POST, request.FILES)
if request.method == "POST":
csv_file = request.FILES['csv_upload']
file_data = csv_file.read().decode("utf-8")
csv_data = file_data.split("\n")
csv_data = file_data.replace("'", "")
try :
for x in csv_data:
fields = x.split(",")
print(fields)
create_hospital = {}
create_hospital['hospital_name'] = fields[0],
create_hospital['hospital_website'] = fields[1],
create_hospital['hospital_fiscalAddress'] = fields[2],
create_hospital['hospital_shippingAddress'] = fields[3],
create_hospital['hospital_region'] = fields[4],
create_hospital['country'] = fields[5],
create_hospital['hospital_contactPerson'] = fields[6],
create_hospital['hospital_contactPhone'] = fields[7],
create_hospital['hospital_contactEmail'] = fields[8],
create_hospital['hospital_ImageLogo'] = fields[9]
created_hospital = HospitalViewRoleForUsers.objects.create(**create_hospital)
create_users = {}
create_users['FKLab_User'] = fields[0],
create_users['user_type'] = "1",
create_users['email'] = fields[11],
create_users['password'] = BaseUserManager().make_random_password(8),
create_users['name'] = fields[10],
# create_users['FKLab_User'] = created_hospital.id
# create_users['user_type'] = "1"
# create_users['email'] = fields[14],
# create_users['password'] = BaseUserManager().make_random_password(8),
# create_users['name'] = fields[13],
# create_users['FKLab_User'] = created_hospital.id
# create_users['user_type'] = "1"
# create_users['email'] = fields[17],
# create_users['password'] = BaseUserManager().make_random_password(8),
# create_users['name'] = fields[16],
created_users = CustomUser.objects.bulk_create(create_users)
create_roles = {}
create_roles['user_id'] = created_users[0],
create_roles['roles'] = fields[12],
created_roles = RoleHospital.objects.create(create_roles)
except Exception as e:
print("%s", e)
data = {"form": form}
return render(request, "admin/csv_upload.html", data)
Things are being add, except the fact that it adds with single quote and parenthesis.
e.g. : ('salkdklas',)
Thank you.
Related
I am trying to scrape pickels.com.au.
I am trying to update the pickels_dataset.csv file if the link is the same and if the price is not the same them I am removing the list and inserting the new row to the CSV file, but it doesn't remove the old entry from the CSV file.
What would be the best way to remove and update the row in the CSV file.
Below is my code...
import requests
from scrapy.selector import Selector
import csv
import re
from tqdm import tqdm
from time import sleep
with open('pickels_dataset.csv', 'a+', newline='', encoding='utf-8') as auction_csv_file:
auction_csv_writer = csv.writer(auction_csv_file)
live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
api_request = requests.get(url=live_auctions_api)
for auctions in api_request.json():
auction_link = auctions.get('viewSaleListingLink')
if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
auction_request = requests.get(url=auction_link)
response = Selector(text=auction_request.text)
sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
if sales_id == []:
continue
auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
auction_sale_link_requests = requests.get(url=auction_sale_link)
auctions_data = auction_sale_link_requests.json().get('SearchResults')
if auctions_data == []:
print("NO RESULTS")
for auction_data in auctions_data:
if int(auction_data.get('MinimumBid')) > 0:
ids = auction_data.get('TargetId')
main_title = auction_data.get('Title')
short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
make = auction_data.get('M ake')
model = auction_data.get('Model')
variant = auction_data.get('Series')
transmission = auction_data.get('Transmission')
odometer = auction_data.get('Odometer')
state = auction_data.get('Location').get('State')
sale_price = auction_data.get('MinimumBid')
link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
sale_date = auction_data.get('SaleEndString')
auction_values = [
main_title, short_title, make,
model, variant, transmission, odometer,
state, "${:,.2f}".format(sale_price).strip() ,
link, sale_date
]
with open('pickels_dataset.csv', 'r+') as csv_read:
auction_reader = list(csv.reader(csv_read))
for each in auction_reader:
if link in each:
each_link, each_price = each[9], each[0]
if (link == each_link) and (sale_price != each_price):
auction_reader.clear()
print('New list found, old list deleted')
auction_csv_writer.writerow(auction_values)
print('New value added')
continue
elif (link == each[9]) and (sale_price == each[0]):
print('Same result already exist in the file')
continue
else:
auction_csv_writer.writerow(auction_values)
print('Unique result found and added.')
break
Your current script is opening your auction CSV file for appending, and then whilst it is still open, attempting to open it again for reading. This is probably why it is not updating as expected.
A better approach would be to first read the entire contents of your existing saved auction file into a dictionary. The key could be the link which would then make it easy to determine if you have already seen an existing auction.
Next scrape the current auctions and update the saved_auctions dictionary as needed.
Finally at the end, write the contents of saved_auctions back to the CSV file.
For example:
import requests
from scrapy.selector import Selector
import csv
import re
auction_filename = 'pickels_dataset.csv'
# Load existing auctions into a dictionary with link as key
saved_auctions = {}
with open(auction_filename, newline='', encoding='utf-8') as f_auction_file:
for row in csv.reader(f_auction_file):
saved_auctions[row[9]] = row # dictionary key is link
live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
api_request = requests.get(url=live_auctions_api)
for auctions in api_request.json():
auction_link = auctions.get('viewSaleListingLink')
if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
auction_request = requests.get(url=auction_link)
response = Selector(text=auction_request.text)
sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
if sales_id == []:
continue
auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range(2010..2021).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
auction_sale_link_requests = requests.get(url=auction_sale_link)
auctions_data = auction_sale_link_requests.json().get('SearchResults')
if auctions_data == []:
print("NO RESULTS")
for auction_data in auctions_data:
if int(auction_data.get('MinimumBid')) > 0:
ids = auction_data.get('TargetId')
main_title = auction_data.get('Title')
short_title = str(auction_data.get('Year')) + ' ' + str(auction_data.get('Make')) + ' ' + str(auction_data.get('Model'))
make = auction_data.get('Make')
model = auction_data.get('Model')
variant = auction_data.get('Series')
transmission = auction_data.get('Transmission')
odometer = auction_data.get('Odometer')
state = auction_data.get('Location').get('State')
minimum_bid = auction_data.get('MinimumBid')
sale_price = "${:,.2f}".format(minimum_bid).strip()
link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
sale_date = auction_data.get('SaleEndString')
auction_values = [
main_title, short_title, make,
model, variant, transmission, odometer,
state, sale_price,
link, sale_date
]
if link in saved_auctions:
if saved_auctions[link][8] == sale_price:
print('Same result already exists in the file')
else:
print('New value updated')
saved_auctions[link] = auction_values # Updated the entry
else:
print('New auction added')
saved_auctions[link] = auction_values
# Update the saved auction file
with open(auction_filename, 'w', newline='', encoding='utf-8') as f_auction_file:
csv_auction_file = csv.writer(f_auction_file)
csv_auction_file.writerows(saved_auctions.values())
If you want to also remove auctions that are no longer active, then it would probably be best to simply ignore the saved file and just write all current entries as is.
I'm creating a Django website where users (who are learning Chinese) can upload Chinese vocabulary lists and the site will return a list of just the unique characters (no duplicates) that the user can download.
It's working and doing everything I described above. But the part that I'm stuck on is that I want to add a filter functionality. I want to add the option to exclude some of the more common Chinese characters from the list that the user downloads (what I'm thinking of as a filter feature).
I have created a dropdown menu where before the user presses upload, they first say whether they want to filter out the 100 most common characters, 500, none, etc. Then, they move to another page where it should take that into account when it's writing to the file that it presents for the user to download.
The logic of what I want to achieve would be something like: if the user selects one FilterPreference, then the web app should use the corresponding file in static files to filter out the characters in that file from the final output. I am fairly sure that I would need to use sessions, but I'm not positive about how to do it properly. Currently choosing a different FilterPreference does not affect the output. Open to any suggestions.
models.py
class FilterPreference(models.Model):
NONE = 'NO'
first_250 = 'F250'
first_500 = 'F500'
first_750 = 'F750'
first_1000 = 'F1000'
PREFERENCE_CHOICES = [
(NONE, 'None'),
(first_250, 'First 250'),
(first_500, 'First 500'),
(first_750, 'First 750'),
(first_1000, 'First 1000'),
]
preference = models.CharField(
max_length=100,
choices=PREFERENCE_CHOICES,
default=NONE,
)
def __str__(self):
return self.preference
forms.py
class FilterForm(forms.ModelForm):
class Meta:
model = FilterPreference
fields = ['preference']
labels = {'preference': ''}
views.py (likely problem area)
#Determine the filter
preference = request.session.get('preference')
if preference == 'None':
file_path = os.path.join(settings.STATIC_ROOT, 'converter/filter1.csv')
elif preference == 'First 250':
file_path = os.path.join(settings.STATIC_ROOT, 'converter/filter2.csv')
elif preference == 'First 500':
file_path = os.path.join(settings.STATIC_ROOT, 'converter/filter3.csv')
elif preference == 'First 750':
file_path = os.path.join(settings.STATIC_ROOT, 'converter/filter4.csv')
else:
file_path = os.path.join(settings.STATIC_ROOT, 'converter/filter5.csv')
views.py (full)
def index(request):
if request.method != 'POST':
#No data submitted; create a blank form.
form = FilterForm()
else:
#POST data submitted; process data.
form = FilterForm(data=request.POST)
if form.is_valid():
form.save()
return HttpResponseRedirect('converter/file_upload')
preference = FilterForm.cleaned_data['preference']
request.session['preference'] = preference
context = {'form':form}
return render(request, "converter/index.html", context)
def file_upload(request):
success = 0
if success == 1:
success = 2
if request.POST and request.FILES:
txtfile = request.FILES['txt_file']
def char_isolate():
#Open and read the uploaded file
ur_text = txtfile.read().decode("utf-8")
text = []
for char in ur_text:
if char not in text:
text.append(char)
text = str(text)
#finding unique
unique = []
for char in text:
if char not in unique:
unique.append(char)
unique = str(unique)
#cleaning
import string
nopunct_unique = unique.translate(str.maketrans('', '', string.punctuation))
nodigit_unique = nopunct_unique.translate(str.maketrans('', '', string.digits))
noletter_unique = nodigit_unique.translate(str.maketrans('', '', string.ascii_letters))
nochinesepunct_unique = noletter_unique.translate({ord(c): None for c in '。;:!?,、'})
clean_unique = nochinesepunct_unique.translate({ord(c): None for c in string.whitespace})
#Determine the filter
preference = request.session.get('preference')
if preference == 'None':
file_path = os.path.join(settings.STATIC_ROOT, 'converter/filter1.csv')
elif preference == 'First 250':
file_path = os.path.join(settings.STATIC_ROOT, 'converter/filter2.csv')
elif preference == 'First 500':
file_path = os.path.join(settings.STATIC_ROOT, 'converter/filter3.csv')
elif preference == 'First 750':
file_path = os.path.join(settings.STATIC_ROOT, 'converter/filter4.csv')
else:
file_path = os.path.join(settings.STATIC_ROOT, 'converter/filter5.csv')
#Filter out common characters
filter_file = str(file_path)
filter=set([])
for word in filter_file:
filter.add(word)
filtered = set([])
for word in clean_unique:
if word not in filter:
filtered.add(word)
#write to file
f = open("text.txt","w+")
for word in filtered:
f.write('\n'+word)
f.close()
#write to file
tmp_path = os.path.join(settings.MEDIA_ROOT, 'tmp/text.txt')
with open(tmp_path, 'w') as f:
item = iter(filtered)
for _ in range(len(filtered)-1):
f.write('%s\n' % next(item))
f.seek(0)
f.write('%s' % next(item))
f.close()
char_isolate()
success = 1
return redirect('converter:file_download')
context = {}
return render(request, "converter/file_upload.html", locals())
def download(request):
path = "tmp/text.txt"
file_path = os.path.join(settings.MEDIA_ROOT, path)
if os.path.exists(file_path):
with open(file_path, 'rb') as f:
try:
response = HttpResponse(f)
response['content_type'] = "application/octet-stream"
response['Content-Disposition'] = 'attachment; filename=' + os.path.basename(file_path)
return response
except Exception:
raise Http404
You are using session already which is the right solution for your purpose in my opinion.
The error lies in the if clause: you need to check for the first value of the choices touple"
elif preference == 'First 250':
should be
elif preference == 'F250':
import pandas as pd
import requests
import json
import datetime
import csv
def get_pushshift_data(after, before, sub):
url = 'https://api.pushshift.io/reddit/search/submission/?&after=' + str(after) + '&before='+ str(before) + '&subreddit='+ str(sub) + '&sort=asc&sort_type=created_utc&size=400'
print(url)
r = requests.get(url).json()
# data = json.loads(r.text, strict=False)
return r['data']
def collect_subData(subm):
subData = list() #list to store data points
title = subm['title']
url = subm['url']
try:
flair = subm['link_flair_text']
except KeyError:
flair = "NaN"
try:
# returns the body of the posts
body = subm['selftext']
except KeyError:
body = ''
author = subm['author']
subId = subm['id']
score = subm['score']
created = datetime.datetime.fromtimestamp(subm['created_utc']) #1520561700.0
numComms = subm['num_comments']
permalink = subm['permalink']
subData.append((subId,title,body,url,author,score,created,numComms,permalink,flair))
subStats[subId] = subData
def update_subFile():
upload_count = 0
location = "subreddit_data_uncleaned/"
print("Input filename of submission file, please add .csv")
filename = input()
file = location + filename
with open(file, 'w', newline='', encoding='utf-8') as file:
a = csv.writer(file, delimiter=',')
headers = ["Post ID","Title","Body","Url","Author","Score","Publish Date","Total No. of Comments","Permalink","Flair"]
a.writerow(headers)
for sub in subStats:
a.writerow(subStats[sub][0])
upload_count+=1
print(str(upload_count) + " submissions have been uploaded into a csv file")
# global dictionary to hold 'subData'
subStats = {}
# tracks no. of submissions
subCount = 0
#Subreddit to query
sub = 'politics'
# Unix timestamp of date to crawl from.
before = int(datetime.datetime(2021,5,17,0,0).timestamp())
after = int(datetime.datetime(2014,1,1,0,0).timestamp())
data = get_pushshift_data(after, before, sub)
while len(data) > 0:
for submission in data:
collect_subData(submission)
subCount+=1
# Calls getPushshiftData() with the created date of the last submission
print(len(data))
print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
after = data[-1]['created_utc']
data = get_pushshift_data(after, before, sub)
print(len(data))
update_subFile()
At line 1: I call the get_pushshift_data(after, before, sub) function to scrape the data and there is no error. But then when I want to the same thing again at line 11 but with different time for after variable(type: int), the program comes out the error of JSONDecodeError: Expecting value: line 1 column 1 (char 0).
This is the image for you to refer to which I have just described above
This is the Error Image
I am attempting to extract a list of URLS from several files using a for loop, however this is resulting in a list of URLS from only the first file, repeated 10 times. I'm not sure what I am doing wrong. Also, I am an absolute beginner at this, so I will presume that there are much better ways of trying to achieve what I want, however this is what I have so far.
type_urls = []
y = 0
for files in cwk_dir:
while y < 10:
open('./cwkfiles/cwkfile{}.crawler.idx'.format(y))
lines = r.text.splitlines()
header_loc = 7
name_loc = lines[header_loc].find('Company Name')
type_loc = lines[header_loc].find('Form Type')
cik_loc = lines[header_loc].find('CIK')
filedate_loc = lines[header_loc].find('Date Filed')
url_loc = lines[header_loc].find('URL')
firstdata_loc = 9
for line in lines[firstdata_loc:]:
company_name = line[:type_loc].strip()
form_type = line[type_loc:cik_loc].strip()
cik = line[cik_loc:filedate_loc].strip()
file_date = line[filedate_loc:url_loc].strip()
page_url = line[url_loc:].strip()
typeandurl = (form_type, page_url)
type_urls.append(typeandurl)
y = y + 1
Here is a more Pythonic way using pathlib and Python 3:
from pathlib import Path
cwk_dir = Path('./cwkfiles')
type_urls = []
header_loc = 7
firstdata_loc = 9
for cwkfile in cwk_dir.glob('cwkfile*.crawler.idx'):
with cwkfile.open() as f:
lines = f.readlines()
name_loc = lines[header_loc].find('Company Name')
type_loc = lines[header_loc].find('Form Type')
cik_loc = lines[header_loc].find('CIK')
filedate_loc = lines[header_loc].find('Date Filed')
url_loc = lines[header_loc].find('URL')
for line in lines[firstdata_loc:]:
company_name = line[:type_loc].strip()
form_type = line[type_loc:cik_loc].strip()
cik = line[cik_loc:filedate_loc].strip()
file_date = line[filedate_loc:url_loc].strip()
page_url = line[url_loc:].strip()
type_urls.append((form_type, page_url))
If you want to test on a small batch of files, replace cwk_dir.glob('cwkfile*.crawler.idx') with cwk_dir.glob('cwkfile[0-9].crawler.idx'). That will give you the first then files if they are sequentially numbered, starting from 0.
And here is better way to put it all together and in a more readable way:
from pathlib import Path
def get_offsets(header):
return dict(
company_name = header.find('Company Name'),
form_type = header.find('Form Type'),
cik = header.find('CIK'),
file_date = header.find('Date Filed'),
page_url = header.find('URL')
)
def get_data(line, offsets):
return dict(
company_name = line[:offsets['form_type']].strip(),
form_type = line[offsets['form_type']:offsets['cik']].strip(),
cik = line[offsets['cik']:offsets['file_date']].strip(),
file_date = line[offsets['file_date']:offsets['page_url']].strip(),
page_url = line[offsets['page_url']:].strip()
)
cwk_dir = Path('./cwkfiles')
types_and_urls = []
header_line = 7
first_data_line = 9
for cwkfile in cwk_dir.glob('cwkfile*.crawler.idx'):
with cwkfile.open() as f:
lines = f.readlines()
offsets = get_offsets(lines[header_line])
for line in lines[first_data_line:]:
data = get_data(line, offsets)
types_and_urls.append((data['form_type'], data['page_url']))
When you get to the second file, the while condition fails as y is already 10.
Try setting y back to 0 just before the while loop:
for files in cwk_dir:
y = 0
while y < 10:
...
And as you're opening the file in the first line inside the while loop, you probably need to close it when exiting the loop.
I'm trying to create a new dictionary from an old one to get only the values I'll new in a df in the future. But the function I created is returning always the same key+values.
def load_data():
with open('./data/awards.json', 'r') as file:
json_file = json.load(file)
events_dict = {}
for data_raw in json_file:
events_dict['event'] = (data_raw['Event'])
#form categories_list
categories_list = []
for all_data in data_raw['Data']:
for awards_data in all_data['nomineesWidgetModel']['eventEditionSummary']['awards']:
#check if it's a premium category
if categories_data['isPremiumCategory'] == True:
for categories_data in awards_data['categories']:
categories = {}
categories['category_name'] = (categories_data['categoryName'])
#form nomination_list
nomination_list = []
for nominations_data in categories_data['nominations']:
primary_nominees = nominations_data['primaryNominees']
if len(primary_nominees)>0:
nominee1 = primary_nominees[0]['name']
secondary_nominees = nominations_data['secondaryNominees']
if len(secondary_nominees)>0:
nominee2 = secondary_nominees[0]['name']
nomination_list.append([nominee1, nominee2, nominations_data['isWinner']])
categories['nominations'] = nomination_list
categories_list.append(categories)
events_dict['categories'] = categories_list
return events_dict
My intention is to get for each award the category, nominations an if it is a winner or not. What I'm getting now is 11 times the same event.
I've tried changing the indentation, append the dict to a list first but nothing helped... I'm sure it's something pretty basic, but I'm new to Python and not seeing my error.
json file
Is it possible to do something like this?
Your logic was a bit off. You want to initialise your events_dict = {} in your loop, then you can append that into a list, which I named results.
def load_data():
with open('C:/test/awards.json', 'r') as file:
json_file = json.load(file)
results = [] #<---- initialize result list
for data_raw in json_file:
events_dict = {} #<--- here initialize your dictionary then build it
events_dict['event'] = data_raw['Event']
#form categories_list
categories_list = []
for all_data in data_raw['Data']:
for awards_data in all_data['nomineesWidgetModel']['eventEditionSummary']['awards']:
#check if it's a premium category
for categories_data in awards_data['categories']: #<---- Need to switch this line
if categories_data['isPremiumCategory'] == True: #<---This needs to follow your loop
categories = {}
categories['category_name'] = (categories_data['categoryName'])
#form nomination_list
nomination_list = []
for nominations_data in categories_data['nominations']:
primary_nominees = nominations_data['primaryNominees']
if len(primary_nominees)>0:
nominee1 = primary_nominees[0]['name']
else:
nominee1 = ''
secondary_nominees = nominations_data['secondaryNominees']
if len(secondary_nominees)>0:
nominee2 = secondary_nominees[0]['name']
else:
nominee2 = ''
nomination_list.append([nominee1, nominee2, nominations_data['isWinner']])
categories['nominations'] = nomination_list
categories_list.append(categories)
events_dict['categories'] = categories_list
results.append(events_dict) #<---- once that dictionary is filled, you can append it into that result list, then move on to make a new events_dict with the next iteration
return results