I'm extracting values from a csv file and storing these in a list.
The problem I have is that unless there is an exact match the elements/strings don't get extracted. How would I go about a case insensitive list search in Django/Python?
def csv_upload_view(request):
print('file is being uploaded')
if request.method == 'POST':
csv_file_name = request.FILES.get('file')
csv_file = request.FILES.get('file')
obj = CSV.objects.create(file_name=csv_file)
result = []
with open(obj.file_name.path, 'r') as f:
f.readline()
reader = csv.reader(f)
#reader.__next__()
for row in reader:
data = str(row).strip().split(',')
result.append(data)
transaction_id = data[1]
product = data[2]
quantity = data[3]
customer = data[4]
date = parse_date(data[5])
try:
product_obj = Product.objects.get(name__iexact=product)
except Product.DoesNotExist:
product_obj = None
print(product_obj)
return HttpResponse()
Edit:
the original code that for some reason doesn't work for me contained the following iteration:
for row in reader:
data = "".join(row)
data = data.split(';')
data.pop()
which allows to work with extracted string elements per row. The way I adopted the code storing the elements in a list (results=[]) makes it impossible to access the elements via the product models with Django.
The above mentioned data extraction iteration was from a Macbook while I'm working with a Windows 11 (wsl2 Ubuntu2204), is this the reason that the Excel data needs to be treated differently?
Edit 2:
Ok, I just found this
If your export file is destined for use on a Macintosh, you should choose the second CSV option. This option results in a CSV file where each record (each line in the file) is terminated with a carriage return, as expected by the Mac
So I guess I need to create a csv file in Mac format to make the first iteration work. Is there a way to make both csv (Windows/Mac) be treated the same? Similar to the mentioned str(row).strip().lower().split(',') suggestion?
If what you're trying to do is simply search for a string case insensitive then all you gotta do is lower the case of your search and your query (or upper).
Here's a revised code
def csv_upload_view(request):
print('file is being uploaded')
if request.method == 'POST':
csv_file_name = request.FILES.get('file')
csv_file = request.FILES.get('file')
obj = CSV.objects.create(file_name=csv_file)
result = []
with open(obj.file_name.path, 'r') as f:
f.readline()
reader = csv.reader(f)
#reader.__next__()
for row in reader:
data = str(row).strip().lower().split(',')
result.append(data)
_, transaction_id, product, quantity, customer, date, *_ = data
date = parse_date(date)
try:
product_obj = Product.objects.get(name__iexact=product)
except Product.DoesNotExist:
product_obj = None
print(product_obj)
return HttpResponse()
Then when you're trying to store the data make sure to store it lowercase.
Also, do not split a csv file on ,. Instead use the Python's CSV library to open a csv file, since the data might contain ,. Make sure to change csv.QUOTE so that it encapsulates everything with ".
Related
I'm new to Python so excuse me if my question is kind of dumb.
I send some data into a csv file (I'm making a password manager). So I send this to this file (in this order), the name of the site, the e-mail corresponding and finally the password.
But I would like to print all the names already written in the csv file but here is my problem, for the first row it does print the whole row but for the following rows it works just well.
Here is my code, I hope u can help me with this.
csv_file = csv.reader(open('mycsvfile.csv', 'r'), delimiter=';')
try :
print("Here are all the sites you saved :")
for row in csv_file :
print(row[0])
except :
print("Nothing already saved")
Maybe it can help, but here is how I wrote my data into the csv file:
#I encrypt the email and the password thanks to fernet and an already written key
#I also make sure that the email is valid
file = open('key.key', 'rb')
key = file.read()
file.close()
f = Fernet(key)
website = input("web site name : \n")
restart = True
while restart :
mail = input("Mail:\n")
a = isvalidEmail(mail)
if a == True :
print("e-mail validated")
restart = False
else :
print("Wrong e-mail")
pws = input("password :\n")
psw_bytes = psw.encode()
mail_bytes = mail.encode()
psw_encrypted_in_bytes = f.encrypt(psw_bytes)
mail_encrypted_in_bytes = f.encrypt(mail_bytes)
mail_encrypted_str = mail_encrypted_in_bytes.decode()
psw_encrypted_str = psw_encrypted_in_bytes.decode()
f = open('a.csv', 'a', newline='')
tup1 = (website, mail_encrypted_str, psw_encrypted_str)
writer = csv.writer(f, delimiter = ';')
writer.writerow(tup1)
print("Saved ;)")
f.close()
return
And here is my output (I have already saved data)
Output (First, you see the name of the ws with the email and the psw encrypted then just the name which is what I want
I finally succeed, instead of using a csv.Reader, i used a csv.DictReader and as all the names i'm looking for are on the same column, i juste have to use the title of the columns.
So here is the code :
with open('mycsv.csv', newline='') as csvfile:
data = csv.DictReader(csvfile)
print("Websites")
print("---------------------------------")
for row in data:
print(row['The_title_of_my_column'])
make list from csv.reader()
rows = [row for row in csv_file]
and now you can get element by identifier using rows as list of lists
rows[id1][id2]
The place where I put this code belongs to an import page.And here there is data in the data I want to import in .txt format, but this data contains the \n character.
if request.method == "POST":
txt_file = request.FILES['file']
if not txt_file .name.endswith('.txt'):
messages.info(request,'This is not a txt file')
data_set = csv_file.read().decode('latin-1')
io_string = io.StringIO(data_set)
next(io_string)
csv_reader = csv.reader(io_string, delimiter='\t',quotechar="|")
for column in csv_reader:
b = Module_Name(
user= request.user,
a = column[1],
b = column[2],
c = column[3],
d = column[4],
e = column[5],
f = column[6],
g = column[7],
h = column[8],
)
b.save()
messages.success(request,"Successfully Imported...")
return redirect("return:return_import")
This can be called the full version of my code. To explain, there is a \n character in the data that comes here as column[1]. This file is a .txt file from another export. And in this export column[1];
This is
a value
and my django localhost new-line character seen in unquoted field - do you need to open the file in universal-newline mode? gives a warning and aborts the import to the system.
the csv reader iterates over rows, not columns. So if you want to append the data from a given column together, you must iterate over all the rows first. For example:
import csv
from io import StringIO
io_string = "this is , r0 c1\r\na value, r1 c2\r\n"
io_string = StringIO(io_string)
rows = csv.reader(io_string)
column_0_data = []
for row in rows:
column_0_data.append(row[0])
print("".join(column_0_data))
the rest of your code looks iffy to me, but that is off topic.
What I'm looking to achieve:
The code added below filters through a parsed HTML page looking for specific values. Each specific value is then added to its own specific list in the form of a dictionary. Once all the values are added to the lists the dictionaries within are then combined into a JSON blob that I can then export.
Note - This is part of a quick PoC, so it was written quick and dirty. Forgive me.
My problem:
When the following lists dictionaries are combined I do not encounter any issues when export the blob:
jobs
names
dates
summaries
However, when the locations list is added in order to be combined into the blob an IndexError exception is encountered. As shown in the image below:
IndexError Encountered
My Analysis:
I've found that sometimes the value is not found because it was not included in the parsed HTML for reason/s that I cannot control, ie. it was not added my the user when it was created. The issue in this case being that the len of the locations list being 14 whilst the len of the other lists being equal at 15 which is causing the IndexError exception when I combine the lists using a for loop.
My Question:
As shown in my code below, I'm trying to handle the issue by assigning a placeholder value, "null", when the scraped value is not found but for some reason the value is not applied and I still encounter the IndexError exception. Any help would be appreciated, thank you in advance.
My Code:
import ast
import sys
# Create empty lists [Global]
jobs = []
names = []
dates = []
summaries = []
locations = []
# Function - Ingest parsed HTML data | Filter out required values
def getJobs(parsedHTML):
# Loop - Get job title
for div in parsedHTML.find_all(name='h2', attrs={'class':'title'}):
for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
val = str(a.getText().strip())
if val is None:
locations.append({"job-title": "null"})
else:
dictItem = {"job-title": f"{val}"}
jobs.append(dictItem)
# Loop - Get job poster's name
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'company'}):
val = str(span.getText().strip())
if val is None:
locations.append({"company-name": "null"})
else:
dictItem = {"company-name": f"{val}"}
names.append(dictItem)
# Loop - Get the date the job post was created
for div in parsedHTML.find_all(name='div', attrs={'class':'result-link-bar'}):
for span in div.find_all(name='span', attrs={'class':'date date-a11y'}):
val = str(span.getText().strip())
if val is None:
locations.append({"date-created": "null"})
else:
dictItem = {"date-created": f"{val}"}
dates.append(dictItem)
# Loop - Get short job description
for divParent in parsedHTML.find_all(name='div', attrs={'class':'result'}):
for divChild in divParent.find_all(name='div', attrs={'class':'summary'}):
val = str(divChild.getText().strip())
if val is None:
locations.append({"short-description": "null"})
else:
dictItem = {"short-description": f"{val}"}
summaries.append(dictItem)
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for span in div.find_all(name='span', attrs={'class':'location'}):
val = str(span.getText().strip())
if val is None:
locations.append({"location": "null"})
else:
dictItem = {"location": f"{val}"}
locations.append(dictItem)
# Function - Generate test data
def testData(parsedHTML, typeProc):
# typeProc == True | Export data to text files
if typeProc:
#getJobs(parsedHTML)
with open("jobs.txt", "w") as file:
for line in jobs:
file.write(str(line))
file.write("\n")
file.close()
with open("names.txt", "w") as file:
for line in names:
file.write(str(line))
file.write("\n")
file.close()
with open("dates.txt", "w") as file:
for line in dates:
file.write(str(line))
file.write("\n")
file.close()
with open("summaries.txt", "w") as file:
for line in summaries:
file.write(str(line))
file.write("\n")
file.close()
with open("locations.txt", "w") as file:
for line in locations:
file.write(str(line))
file.write("\n")
file.close()
# typeProc == False | Import data from txt files, convert to dictionary and append to list
elif typeProc == False:
with open("jobs.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
jobs.append(content[i])
file.close()
with open("names.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
names.append(content[i])
file.close()
with open("dates.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
dates.append(content[i])
file.close()
with open("summaries.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
summaries.append(content[i])
file.close()
with open("locations.txt", "r") as file:
content = file.readlines()
for i in range(len(content)):
content[i] = content[i].replace("\n", "")
content[i] = ast.literal_eval(content[i])
locations.append(content[i])
file.close()
# Else | If this else is hit, something is greatly fvcked
else:
print("Function: testData | Error: if statement else output")
sys.exit(1)
# Function - Remove items from all lists
def wipeLists():
jobs.clear()
names.clear()
dates.clear()
summaries.clear()
locations.clear()
# Function - JSON Blob Generator
def genJSON(parsedHTML):
# Testing with cached local IRL data
#testData(parsedHTML, False)
getJobs(parsedHTML)
jsonBlob = []
# Merge dictionaries | Combining dictionaries into single object + Append to jsonBlob list
for i in range(len(jobs)):
sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i], **locations[i]}
#sumObj = {**jobs[i], **names[i], **dates[i], **summaries[i]}
jsonBlob.append(sumObj)
return jsonBlob
Thank You #pavel for your notes on how to approach the issue. I found that the value I was looking for was actually a required field when it was created and for some reason I was just not getting the correct amount of values when I was filtering the parsed data.
I reviewed the source code of the page/s again and found that there was another field with the exact value I was looking for. So now instead of getting the text of a span-element inside the parent div, I am getting the custom data-* attribute value of the parent div-element. I have not encountered a single error whilst testing.
Updated Code:
# Loop - Get job location
for div in parsedHTML.find_all(name='div', attrs={'class':'sjcl'}):
for divChild in div.find_all(name='div', attrs={'class':'recJobLoc'}):
dictItem = {"location": f"{divChild['data-rc-loc']}"}
locations.append(dictItem)
Thank You to everyone who tried to help. This has been resolved.
My code is currently hard coded to only accept csv files in the column format:
first_name,last_name,phone,email,address,company
However I would like users to be able to upload csv files that are in any* format order and naming scheme and correctly populate our forms. For example:
Email,LastName,FirstName,Company,Phone,Address
would be a valid column format. How would I go about doing that? Relevant code as follows:
dr = csv.reader(open(tmp_file,'r'))
data_dict = {}
headers = next(dr)
print (headers)
#skips over first line in csv
iterlines = iter(dr)
next(iterlines)
for row in iterlines:
data_dict["first_name"] = row[0]
#print(data_dict["first_name"])
data_dict["last_name"] = row[1]
#print(data_dict["last_name"])
data_dict["phone"] = row[2]
#print(data_dict["phone"])
data_dict["email"] = row[3]
#print(data_dict["email"])
data_dict["address"] = row[4]
#print(data_dict["address"])
data_dict["company"] = row[5]
#print(data_dict["company"])
#adds to form
try:
form = AddContactForm(data_dict)
if form.is_valid():
obj = form.save(commit=False)
obj.owner = request.user.username
first_name = form.cleaned_data.get(data_dict["first_name"])
last_name = form.cleaned_data.get(data_dict["last_name"])
phone = form.cleaned_data.get(data_dict["phone"])
email = form.cleaned_data.get(data_dict["email"])
address = form.cleaned_data.get(data_dict["address"])
company = form.cleaned_data.get(data_dict["company"])
obj.save()
else:
logging.getLogger("error_logger").error(form.errors.as_json())
except Exception as e:
logging.getLogger("error_logger").error(repr(e))
pass
headers = "first_name,last_name,email"
headers_array = headers.split(',')
headers_map = {}
for i, column_name in enumerate(headers_array):
headers_map[column_name] = i
#creates {'first_name': 0, 'last_name': 1, 'email': 2}
Now you can now use headers_map to get the row element
row[headers_map['first_name']]
Edit: For those loving one liners
headers_map = {column_name: i for i, column_name in enumerate(headers.split(','))}
There are a number of approaches to handling inconsistent header names in the file. The best approach is to prevent it by rejecting such files at upload time, obliging the uploader to correct them. Assuming this isn't possible, you could try to transform the provided headers into what you want
import csv
import io
import re
with open(tmp_file, 'r') as f:
reader = csv.reader
headers = next(reader)
# Make a new header list with placeholders
fixed_headers = [None * len(headers)]
for i, value in enumerate(headers)
fixed = re.sub(r'(\w+)(?<=[a-z])([A-Z]\w+)', r'\1_\2', v).lower()
new_headers[i] = fixed
The regex finds capital letters in the middle of strings and inserts an underscore; then str.lower is called on the result (so values like 'Email' will be converted to 'email'.
Now rewrite the csv with the fixed headers:
with open(tmp_file, 'r') as f:
reader = csv.reader(f)
next(reader)
new_file = io.StringIO()
writer = csv.writer(new_file)
writer.writerow(fixed_headers)
for row in reader:
writer.writerow(row)
# Rewind the file pointer
new_file.seek(0)
Use csv.DictReader to get rows as dictionaries of values mapped to headers.
dr = csv.DictReader(new_file)
for data_dict in dr:
#adds to form
try:
form = AddContactForm(data_dict)
if form.is_valid():
obj = form.save(commit=False)
obj.owner = request.user.username
first_name = form.cleaned_data.get(data_dict["first_name"])
last_name = form.cleaned_data.get(data_dict["last_name"])
phone = form.cleaned_data.get(data_dict["phone"])
email = form.cleaned_data.get(data_dict["email"])
address = form.cleaned_data.get(data_dict["address"])
company = form.cleaned_data.get(data_dict["company"])
obj.save()
else:
logging.getLogger("error_logger").error(form.errors.as_json())
except Exception as e:
logging.getLogger("error_logger").error(repr(e))
pass
My intention was to copy a piece of string after either a colon or equal sign from File 1 , and pasting that string in File 2 in a similar location after either a colon or equal sign.
For instance, if File 1 has:
username: Stack
File 2 is originally empty:
username=
I want Stack to be copied over to File 2 after username. Currently, I'm stuck and not sure what to do. The program piece I made below doesn't copy the username. I would greatly appreciate any input!
with open("C:/Users/SO//Downloads//f1.txt", "r") as f1:
with open("C:/Users/SO//Downloads//f2.txt", "r+") as f2:
searchlines = f1.readlines()
searchlines_f2=f2.readlines()
for i, line in enumerate(searchlines):
if 'username' in line:
for l in searchlines[i:i+1]:
ind = max(l.find(':'), l.find('='), 0) #finding index of specific characters
copy_string=l[ind+1:].strip() #copying string for file 2
for l in searchlines_f2[i:i+1]:
if 'username' in line:
f2.write(copy_string)
I think something like this will get you what you need in a more maintainable and Pythonic way.
Note the use of regex as well as some string methods (e.g., startswith)
import re
SOURCE_PATH = "C:/Users/SO//Downloads//f1.txt"
TARGET_PATH = "C:/Users/SO//Downloads//f2.txt"
def _get_lines(filepath):
""" read `filepath` and return a list of strings """
with open(filepath, "r+") as fh:
return fh.readlines()
def _get_value(fieldname, text):
""" parse `text` to get the value of `fieldname` """
try:
pattern = '%s[:=]{1}\s?(.*)' % fieldname
return re.match(pattern, text).group(1)
except IndexError:
# you may want to handle this differently!
return None
def _write_target(filepath, trgt_lines):
""" write `trgt_lines` to `filepath` """
with open(filepath, "w+") as fh:
fh.writelines(trgt_lines)
src_lines = _get_lines(SOURCE_PATH)
trgt_lines = _get_lines(TARGET_PATH)
# extract field values from source file
fields = ['username', 'id', 'location']
for field in fields:
value = None
for cur_src in src_lines:
if cur_src.startswith(field):
value = _get_value(field, cur_src)
break
# update target_file w/ value (if we were able to find it)
if value is not None:
for i, cur_trgt in enumerate(trgt_lines):
if cur_trgt.startswith('{0}='.format(field)):
trgt_lines[i] = '{0}={1}'.format(field, value)
break
_write_target(TARGET_PATH, trgt_lines)