Basic text mining. I am getting a UnicodeEncodeError - python

import codecs
import csv
# Flag every VAERS report as serious ("Y") or not ("N") and write the result
# to a new CSV that carries one extra SERIOUS column.
#
# Both files are opened with the same explicit encoding: relying on the
# platform default for the output is what raises UnicodeEncodeError when a
# decoded character has no representation in that default codec.  latin1 maps
# every byte to exactly one code point, so the data round-trips unchanged.
# newline="" is what the csv module expects so that it controls line endings.
with open("2019VAERSData.csv", "r", encoding="latin1", newline="") as source, \
        open("2019-vaers-serious.csv", "w", encoding="latin1", newline="") as target:
    data = csv.reader(source)
    writer = csv.writer(target)

    # The first row holds the column names.
    keys = next(data)
    # Positions of the columns that mark a report as serious.
    serious_keys = [keys.index(key) for key in ["DISABLE", "DIED", "ER_VISIT", "HOSPITAL"]]

    keys += ["SERIOUS"]
    writer.writerow(keys)

    for row in data:
        # Serious as soon as any of the marker columns is "Y".
        is_serious = any(row[key] == "Y" for key in serious_keys)
        row += ["Y" if is_serious else "N"]
        writer.writerow(row)

Related

Sorting and coloring excel file via python

import pandas as pd
from fuzzywuzzy import fuzz
import openpyxl
from itertools import combinations
from difflib import SequenceMatcher
import json
def readExcel():
    """Group near-identical supplier names and save the groups as JSON.

    Reads the name column of ``tedarikci.xlsx``, and for every name that has
    not yet been placed in a group collects all other names whose
    ``SequenceMatcher`` ratio against it exceeds 0.9.  The mapping
    ``{name: [similar names]}`` is written to ``dataMusteri.json``.
    """
    df = pd.read_excel('tedarikci.xlsx')
    # Put all names into a plain Python list.
    isimler = df['Müşteri/tedarikçi ismi'].tolist()  # names

    benzerIsimler = {}  # similarNames
    # Every name already used as a group key or as a group member.  A set
    # replaces rescanning all stored lists for each candidate name.
    grouped = set()

    print(isimler)  # names
    # Compare the names with each other.
    for i, item in enumerate(isimler):
        print(i)
        if item in grouped:
            continue
        print(item)
        grouped.add(item)
        matches = benzerIsimler[item] = []
        for item2 in isimler:
            if item != item2 and SequenceMatcher(None, item, item2).ratio() > 0.9:
                matches.append(item2)
                grouped.add(item2)

    for matches in benzerIsimler.values():
        print(matches)

    with open("dataMusteri.json", "w") as outfile:
        json.dump(benzerIsimler, outfile)
def printExcel():
    """Write ``dilara.xlsx`` with the supplier rows ordered by similarity group.

    Loads the groups from ``dataMusteri.json`` and, for each group, emits the
    row of the group's key name followed by the rows of its similar names, all
    taken from the active sheet of ``tedarikci.xlsx``.  Names without a
    matching first-column value are skipped.
    """
    with open("dataMusteri.json", "r") as infile:
        benzerIsimler = json.load(infile)

    workbook = openpyxl.load_workbook("tedarikci.xlsx")
    sheet = workbook.active

    # Index the sheet once: first-column value -> first row carrying it.
    # setdefault keeps the first occurrence, matching a top-down scan.
    rows_by_name = {}
    for row in sheet.iter_rows():
        rows_by_name.setdefault(row[0].value, row)

    sortedRows = []
    for item, similar in benzerIsimler.items():
        print(item)
        # The group's own row first, then one row per similar name.  Looking
        # up the similar name itself (not the group key again) is the fix:
        # previously every member slot repeated the key's row.
        for name in [item] + list(similar):
            row = rows_by_name.get(name)
            if row is not None:
                sortedRows.append(row)
    print(sortedRows)

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.append(["Müşteri/tedarikçi ismi","E-posta","Müşteri/tedarikçi grubu","TL Bakiye ", "USD Bakiye", "EUR Bakiye", "GBP Bakiye", "Adres", "İl", "İlçe", "Telefon", "Faks", "Vergi dairesi", "Vergi numarası/TC kimlik no"])
    for row in sortedRows:
        ws.append([cell.value for cell in row])
    wb.save("dilara.xlsx")


printExcel()
I am trying to sort the similar names on an excel file and after sorting I want to color the similar/same ones with the same color.
The code above is what I've done so far. I can sort but I can't paint the same/similar rows, my code paints all rows with the same color. Do you have any suggestion?

Convert JSON file to flat table

Good Afternoon,
I am still pretty new to Python but have found it particularly addicting, but there are def some "quirks" to python that have been a pain to get through. I am currently trying to take a JSON file and flatten it out into a table. There are a ton of posts, specifically here on stack overflow on converting it into a flat dict, but that doesn't allow me to convert it into a table. This has been way harder than I expected.
I am currently getting the following error, which to me seems like it is something wrong with my key generation portion.
Code:
import json
import os
import csv
import copy
from pandas.io.json._normalize import nested_to_record
#Basic Variables
scriptDirectory = os.path.dirname(os.path.realpath(__file__))
def getKeys(dictionary:dict, result:list = None, parentKey='', sep='.',skipParent = False) -> list:
    """Collect the unique flattened key paths of a nested dictionary.

    Args:
        dictionary: The (possibly nested) mapping to walk.
        result: List to extend in place; a new list is created when None.
        parentKey: Path prefix for the keys of ``dictionary``.
        sep: Separator placed between path components.
        skipParent: When True, the keys of ``dictionary`` itself are left out
            of the paths of their children.  This applies to this level only;
            deeper levels always include their parent.

    Returns:
        The list of key paths in first-seen order.  Leaf entries whose own key
        is the empty string are ignored.
    """
    if result is None:
        result = []
    # Loop through all keys and collect the unique paths.
    for key, value in dictionary.items():
        newKey = parentKey + sep + str(key) if parentKey else str(key)
        if isinstance(value, dict):
            # Forward ``sep`` so nested levels use the caller's separator
            # instead of silently falling back to the default.
            getKeys(value, result=result, parentKey='' if skipParent else newKey, sep=sep)
        elif key != "" and newKey not in result:
            result.append(newKey)
    return result
def convertKey(data:str,languageDict:dict):
    """Return the entry of ``languageDict`` stored under ``data``.

    Falls back to ``data`` itself when the dictionary has no such key.
    """
    return languageDict.get(data, data)
# Load the two JSON inputs that sit next to this script.
# Item library
with open(os.path.join(scriptDirectory,'inventoryItem.json'), "r",encoding='utf-8') as read_file:
    lib = json.load(read_file)
# English string table
with open(os.path.join(scriptDirectory,'en.json'), "r",encoding='utf-8') as read_file:
    en = json.load(read_file)

# One CSV file per inventory category.
for key, items in lib['inventoryItem'].items():
    print(key)
    # Every column seen in this category, each pre-filled with None.
    dictTemplate = dict.fromkeys(getKeys(dictionary=items,skipParent = True),None)
    print(dictTemplate)
    writer = None
    try:
        with open(os.path.join(scriptDirectory,'export',f"{key}.csv"),"w", newline='', encoding='utf-8') as csvfile:
            for itemData in items.values():
                # Template values are all None, so a shallow copy is enough.
                entry = dict(dictTemplate)
                entry.update(nested_to_record(itemData, sep='.'))

                if key == 'coin':
                    entry['name'] = convertKey(data = f"LIB_COIN_NAME_{entry['id']}",languageDict=en)
                    entry['description'] = convertKey(data = f"LIB_COIN_DESC_{entry['id']}",languageDict=en)
                    for field in ('obtainNavigatorData.not_enough_message',
                                  'obtainNavigatorData.not_enough_title',
                                  'obtainNavigatorData.button_label'):
                        entry[field] = convertKey(data = entry[field],languageDict=en)
                elif key == 'scroll':
                    for field in ("fragmentMergeCost", "fragmentSellCost", "fragmentBuyCost", "buyCost"):
                        del entry[field]
                # 'consumable' (and every other category) is written as-is.

                if writer is None:
                    # The first row of a category fixes the CSV columns.
                    writer = csv.DictWriter(csvfile, fieldnames=entry.keys(),delimiter = ';')
                    writer.writeheader()
                writer.writerow(entry)
    except IOError:
        print("I/O error")
Data set that's giving me trouble.
"consumable": {
"96": {
"id": 96,
"rewardType": "",
"rewardAmount": 0,
"effectDescription": {
"": ""
},
"buyCost": null,
"sellCost": null,
"buySpecialCost": null,
"assetAtlas": 4,
"assetTexture": "social_vk",
"iconAssetTexture": "",
"color": 1,
"hidden": 0,
"descLocaleId": "PLAY_AT_HOME_TICKET",
"obtainNavigatorData": null
},
Desired Output.
This is from a section called "coins", but the example data is from "consumables". Certain sections work fine, but others cause issues because the value is set to "","".
Finally figured out the solution. For anyone who wants to properly convert a JSON file to a table, this is the only solution I could come up with :)
import json
import os
import csv
import copy
def processJSON(initialDict:dict, createTemplate:bool = False, existingKeys:dict = None, parentKey:str = None, sep:str ='.', skipParent:bool = False) -> dict:
    """Flatten a nested dictionary into a single-level ``{path: str(value)}`` dict.

    Args:
        initialDict: The (possibly nested) mapping to flatten.
        createTemplate: When True every value is the empty string, which gives
            a header/template row containing only the column names.
        existingKeys: Entries to start from.  Paths already present keep their
            value.  The mapping passed in is copied, never modified.
        parentKey: Path prefix for the keys of ``initialDict``.
        sep: Separator placed between path components.
        skipParent: When True, the keys of ``initialDict`` itself are left out
            of their children's paths (this level only).

    Returns:
        A new dict mapping each flattened path to its stringified value.  The
        first value seen for a path wins.
    """
    # Copy so the caller's mapping is not mutated as a side effect.
    outPut = dict(existingKeys) if existingKeys else {}
    _flattenInto(outPut, initialDict, createTemplate, parentKey, sep, skipParent)
    return outPut


def _flattenInto(outPut:dict, node:dict, createTemplate:bool, parentKey, sep:str, skipParent:bool) -> None:
    """Add the flattened entries of ``node`` to ``outPut`` in place."""
    for key, value in node.items():
        keyTitle = str(parentKey + sep + key if parentKey else key)
        if isinstance(value, dict):
            # Only the outermost level may drop its own key from the path.
            _flattenInto(outPut, value, createTemplate, '' if skipParent else keyTitle, sep, False)
        elif keyTitle not in outPut:
            outPut[keyTitle] = str('' if createTemplate else value)
def convertKey(data:str,languageDict:dict):
    """Look ``data`` up in ``languageDict``; return ``data`` unchanged if absent."""
    if data in languageDict:
        return languageDict[data]
    return data
#Basic Variables
scriptDirectory = os.path.dirname(os.path.realpath(__file__))

# Load the two JSON inputs that sit next to this script.
# Item library
with open(os.path.join(scriptDirectory,'inventoryItem.json'), "r",encoding='utf-8') as read_file:
    lib = json.load(read_file)
# English string table
with open(os.path.join(scriptDirectory,'en.json'), "r",encoding='utf-8') as read_file:
    en = json.load(read_file)

# One CSV file per inventory category.
for key, items in lib['inventoryItem'].items():
    writer = None
    # Template row: every column of the category mapped to ''.
    header = processJSON(initialDict=items, createTemplate=True, skipParent=True)
    try:
        with open(os.path.join(scriptDirectory,'export',f"{key}.csv"),"w", newline='', encoding='utf-8') as csvfile:
            for itemData in items.values():
                # Start each row from its own copy of the template so rows
                # never share state; the values are plain strings, so a
                # shallow copy is sufficient.
                row = dict(header)
                row.update(processJSON(initialDict=itemData))

                if key == 'coin':
                    row['name'] = convertKey(data = f"LIB_COIN_NAME_{row['id']}",languageDict=en)
                    row['description'] = convertKey(data = f"LIB_COIN_DESC_{row['id']}",languageDict=en)
                    for field in ('obtainNavigatorData.not_enough_message',
                                  'obtainNavigatorData.not_enough_title',
                                  'obtainNavigatorData.button_label'):
                        row[field] = convertKey(data = row[field],languageDict=en)
                elif key == 'consumable':
                    row['name'] = convertKey(data = f"LIB_CONSUMABLE_NAME_{row['id']}",languageDict=en)
                    row['description'] = convertKey(data = f"LIB_CONSUMABLE_DESC_{row['descLocaleId']}",languageDict=en)
                    for field in ('obtainNavigatorData.button_label',
                                  'obtainNavigatorData.not_enough_message',
                                  'obtainNavigatorData.not_enough_title'):
                        row[field] = convertKey(data = row[field],languageDict=en)
                elif key == 'gear':
                    row['name'] = convertKey(data = f"LIB_GEAR_NAME_{row['id']}",languageDict=en)
                elif key == 'petGear':
                    row['name'] = convertKey(data = f"LIB_PET_GEAR_NAME_{row['id']}",languageDict=en)
                elif key == 'pseudo':
                    row['name'] = convertKey(data = f"LIB_PSEUDO_{row['constName']}",languageDict=en)
                    row['description'] = convertKey(data = f"LIB_PSEUDO_DESC_{row['id']}",languageDict=en)
                elif key == 'scroll':
                    row['name'] = convertKey(data = f"LIB_SCROLL_NAME_{row['id']}",languageDict=en)
                    for field in ("fragmentMergeCost", "fragmentSellCost", "fragmentBuyCost", "buyCost"):
                        del row[field]
                else:
                    # Category without special handling: report its name.
                    print(key)

                if writer is None:
                    # The first row of a category fixes the CSV columns.
                    writer = csv.DictWriter(csvfile, fieldnames=row.keys(),delimiter = ',')
                    writer.writeheader()
                writer.writerow(row)
    except IOError:
        print("I/O error")

Why is my csv file only writing one line?

import csv
def write_to_dictionaries_to_csv(csvWriter,lst_dic,lst_keys):
    """Return the ``lst_keys`` values of the first record in the global ``data``.

    ``csvWriter`` and ``lst_dic`` are accepted but not used.  Returns None
    when ``data`` is empty.
    """
    for record in data:
        # Returning inside the loop ends it on the first record, so later
        # records are never looked at.
        return [record[name] for name in lst_keys if name in record]


data = [{'tow_reason': 'IL', 'tow_date': '2013-06-18'}, {'tow_date': '2014-09-25', 'tow_reason': 'GA'}]

with open("smallDataFileIWrote.csv", 'w') as f_out:
    csv_w = csv.writer(f_out)
    result = write_to_dictionaries_to_csv(csv_w, data, ['tow_reason','tow_date'])
    # A single writerow() call produces a single line in the file.
    csv_w.writerow(result)
Why is this code only writing:
IL,2013-06-18
to the file?
I want the file to have both:
IL, 2013-06-18
GA, 2014-09-25
written to the file what am I doing wrong?
You are reinitializing the lst every time in the loop and return inside the loop.
Move it out:
def write_to_dictionaries_to_csv(csvWriter,lst_dic,lst_keys):
    """Build one CSV row per dictionary in ``lst_dic``.

    Args:
        csvWriter: Unused; kept so existing call sites keep working.
        lst_dic: The dictionaries to convert, one per output row.
        lst_keys: Keys to extract, in column order.  A key missing from a
            dictionary is skipped for that row.

    Returns:
        A list of rows, each a list of the extracted values.
    """
    lst = []
    # Iterate the argument, not a module-level variable, so the function
    # works for whatever list the caller passes in.
    for dic in lst_dic:
        lst.append([dic[key] for key in lst_keys if key in dic])
    return lst
For the writing:
result = write_to_dictionaries_to_csv(csv_w, data, ['tow_reason','tow_date'])
# writerows() emits every inner list as its own line.
csv_w.writerows(result)
Final code:
import csv
def write_to_dictionaries_to_csv(lst_keys):
    """Return one row per record of the global ``data``.

    Each row holds the record's values for ``lst_keys`` in that order; keys a
    record does not have are skipped.
    """
    rows = []
    for record in data:
        rows.append([record[name] for name in lst_keys if name in record])
    return rows


data = [{'tow_reason': 'IL', 'tow_date': '2013-06-18'},
        {'tow_date': '2014-09-25', 'tow_reason': 'GA'}]

with open('smallDataFileIWrote.csv', 'w', newline='\n', encoding='utf-8') as f_out:
    csv_w = csv.writer(f_out)
    result = write_to_dictionaries_to_csv(['tow_reason', 'tow_date'])
    # One output line per extracted row.
    csv_w.writerows(result)
P.S.: Your code is quite ugly. Try removing unnecessary parts/variables and naming variables more meaningfully.
Your lst is reset to an empty list on every iteration because it is created inside the loop. Try this:
import csv
def write_to_dictionaries_to_csv(csvWriter,lst_dic,lst_keys):
    """Build one CSV row per dictionary in ``lst_dic``.

    Args:
        csvWriter: Unused; kept so the call signature stays the same.
        lst_dic: The dictionaries to convert, one per output row.
        lst_keys: Keys to extract, in column order.  A key missing from a
            dictionary is skipped for that row.

    Returns:
        A list of rows, each a list of the extracted values.
    """
    lst = []
    for dic in lst_dic:
        # Keep each record's values in their own list.  Appending them all to
        # one flat list would put every record on the same CSV line.
        lst.append([dic[key] for key in lst_keys if key in dic])
    return lst


data = [{'tow_reason': 'IL', 'tow_date': '2013-06-18'}, {'tow_date': '2014-09-25', 'tow_reason': 'GA'}]

# newline='' lets the csv module control the line endings itself.
with open("smallDataFileIWrote.csv", 'w', newline='') as f_out:
    csv_w = csv.writer(f_out)
    result = write_to_dictionaries_to_csv(csv_w, data, ['tow_reason','tow_date'])
    # writerows() writes one line per inner list.
    csv_w.writerows(result)

Many CSV files (workbooks) are generated. I want them as a single CSV file (one workbook with a single sheet) (in web crawling).

In my code many CSV files are being generated. I want them as a single CSV file. The values of the five parties for all years have to be written to one CSV file. Basically I am doing web crawling and trying to create a time series of polls for those parties (CDU, SPD, FDP, GRUNEN, LEFT).
import scrapy
import re
import csv
class VoteSpider(scrapy.Spider):
    """Collect Forsa poll values per party from wahlrecht.de.

    The start page holds the current polls and links to one page per past
    year.  For every page the values of the five parties are written to one
    CSV file per party (``<party><label>.csv``, where label is ``Current`` or
    the year) and stored on the spider in ``self.<party>[label]``.
    """

    name = 'VoteSpider'
    start_urls = ['https://www.wahlrecht.de/umfragen/forsa.htm']

    # (attribute name / file-name prefix, index of the party's value among the
    # text nodes of a table row).
    _PARTY_COLUMNS = (('CDU', 2), ('SPD', 3), ('Green', 4), ('FDP', 5), ('left', 6))

    # Number of leading <tr> elements to drop on the current-polls page.
    _CURRENT_SKIPPED_ROWS = 5

    # Number of leading <tr> elements to drop per archive year; years not
    # listed drop none.
    _SKIPPED_ROWS_BY_YEAR = {
        '1998': 3, '1999': 3, '2000': 3, '2001': 3, '2002': 3, '2003': 3,
        '2004': 3, '2005': 5, '2006': 3, '2007': 4, '2008': 4, '2013': 4,
    }

    def __init__(self, *args, **kwargs):
        # Let scrapy.Spider run its own initialisation as well.
        super().__init__(*args, **kwargs)
        self.CDU = {}
        self.SPD = {}
        self.FDP = {}
        self.Green = {}
        self.left = {}

    def parse(self, response):
        """Schedule every linked year page, then process the current table."""
        regex = r"[forsa]+[\/]+[0-9]+.htm"
        # XPath selects attributes with '@'.
        tableBody = response.xpath('//*[@class="wilko"]//tbody')
        for link in response.xpath('*//a/@href').extract():
            if re.search(regex, link, re.MULTILINE):
                newlink = "https://www.wahlrecht.de/umfragen/" + link
                yield scrapy.Request(url=newlink, callback=self.parseLink, meta={'name': link})
        self.parseTable(tableBody)

    def parseTable(self, tableBody):
        """Store and write the values of the current-polls table."""
        self._storeTable(tableBody, 'Current', self._CURRENT_SKIPPED_ROWS)

    def parseLink(self, response):
        """Store and write the values of one archive-year page."""
        name = response.meta.get('name')
        yearNumber = re.findall(r'\d+', name)[0]
        skipped = self._SKIPPED_ROWS_BY_YEAR.get(yearNumber, 0)
        tableBody = response.xpath('//*[@class="wilko"]//tbody')
        self._storeTable(tableBody, yearNumber, skipped)

    def _storeTable(self, tableBody, label, skipped):
        """Extract the party values, write one CSV per party, keep them on self."""
        # NOTE(review): '//tr' is an absolute path, so it selects every <tr>
        # in the document rather than only those below tableBody.  Left
        # unchanged because the skip counts were presumably tuned against
        # this row numbering - confirm before switching to './/tr'.
        rows = tableBody.xpath('//tr')[skipped:]

        values = {party: [] for party, _ in self._PARTY_COLUMNS}
        for row in rows:
            cells = row.xpath('td//text()')
            for party, index in self._PARTY_COLUMNS:
                values[party].append(cells[index].extract())

        for party, _ in self._PARTY_COLUMNS:
            # Each file gets its own party's values, one value per line.
            # Wrapping each value in a list keeps it in a single cell: given
            # a bare string, writerows() treats every character as a field.
            with open(party + label + '.csv', 'w', newline='') as csvFile:
                writer = csv.writer(csvFile)
                writer.writerows([value] for value in values[party])
            # Same shape as before: a one-element list holding the values.
            getattr(self, party)[label] = [values[party]]
The expected output is the values of all parties (CDU, SPD, GRUNEN, FDP, LEFT) for all years in one CSV file.
Instead of opening multiple files, you can append to a single file, like so:
...
# 'a+' appends, so repeated calls keep adding to the same file.
with open('ALL'+yearNumber+'.csv', 'a+') as csvFile:
    writer = csv.writer(csvFile)
    for partyValues in (CDU, SPD, left, Green, FDP):
        writer.writerows(partyValues)
...

My function to extract totals is exhausting my input file for future reading

The client includes 3 rows at the bottom that contain totals for me to reconcile against in my program. Only problem is that my program is exhausting the input file with readlines() before it can do anything else. Is there a way to keep the file from being exhausted during my get_recon_total function call?
#!/usr/bin/env python
# pre_process.py
import csv
import sys
def main():
    """Pre-process the input CSV named in argv[1] into the file named in argv[2].

    Prints the reconciliation totals read from the input and the counts
    produced while writing the output.
    """
    infile, outfile = sys.argv[1], sys.argv[2]
    with open(infile, 'rbU') as in_obj:
        # Build the reader and remember its field names for writing later.
        reader, fieldnames = open_reader(in_obj)
        # NOTE: this reads from the same file object the reader iterates.
        client_totals = get_recon_totals(in_obj)
        nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = client_totals
        print('%s %s %s' % (nav_tot_cnt, nav_rec_cnt, nav_erec_cnt))
        # Materialise the rows as a list ordered by person and donation date.
        reader_list = sorted(reader, key=lambda row: (row['PEOPLE_ID'],
                                                      row['DON_DATE']))
        # Distinct (PEOPLE_ID, DON_DATE) pairs, used as section headers.
        header_list = create_header_list(reader_list)
        # Section header -> list of the row dictionaries belonging to it.
        master_dict = map_data(header_list, reader_list)
        # Write the processed file and collect the counts to compare with
        # the footer record.
        tot_cnt, rec_cnt, erec_cnt = write_data(master_dict, outfile, fieldnames)
        print('%s %s %s' % (tot_cnt, rec_cnt, erec_cnt))
def open_reader(file_obj):
    '''
    Wrap file_obj in a comma-delimited csv.DictReader.

    The reader takes the first line as the field names and applies them
    to every following row.  Returns the pair (reader, field names); the
    field names are needed again when the rows are written out with
    DictWriter.
    '''
    dict_reader = csv.DictReader(file_obj, delimiter=',')
    field_names = dict_reader.fieldnames
    return dict_reader, field_names
def create_header_list(in_obj):
    '''
    Return the distinct (PEOPLE_ID, DON_DATE) pairs of the rows in in_obj,
    in the order in which each pair first appears.
    '''
    p_id_list = []
    # Set used only for the membership test, so each row costs O(1) instead
    # of a scan over the list built so far.
    seen = set()
    for row in in_obj:
        pair = (row['PEOPLE_ID'], row['DON_DATE'])
        if pair not in seen:
            seen.add(pair)
            p_id_list.append(pair)
    return p_id_list
def map_data(header_list, data_obj):
    '''
    Group the rows of data_obj under an extended section header.

    header_list holds (PEOPLE_ID, DON_DATE) pairs.  For each pair the
    result maps the tuple

        (PEOPLE_ID, DON_DATE, DEDUCT_AMT, ND_AMT, DEDUCT_YTD,
         NONDEDUCT_YTD[, DEDUCT_YTD + NONDEDUCT_YTD])

    built from the first matching row, to the list of all matching rows.
    The trailing sum is left out when either YTD value is not numeric.
    Pairs without any matching row are skipped.
    '''
    # Group the rows once instead of rescanning every row per header.  This
    # also means data_obj only has to be iterable a single time.
    rows_by_pair = {}
    for row in data_obj:
        pair = (row['PEOPLE_ID'], row['DON_DATE'])
        rows_by_pair.setdefault(pair, []).append(row)

    master_dict = {}
    for element in header_list:
        client_section_list = rows_by_pair.get(tuple(element))
        if not client_section_list:
            # Nothing to describe for this header.
            continue
        first_row = client_section_list[0]
        key_fields = list(element)
        key_fields.extend([first_row['DEDUCT_AMT'],
                           first_row['ND_AMT'],
                           first_row['DEDUCT_YTD'],
                           first_row['NONDEDUCT_YTD']])
        try:
            key_fields.append(float(first_row['DEDUCT_YTD']) +
                              float(first_row['NONDEDUCT_YTD']))
        except ValueError:
            # Non-numeric YTD value: the header simply carries no total.
            pass
        master_dict[tuple(key_fields)] = client_section_list
    return master_dict
def write_data(in_obj, outfile, in_fieldnames):
    '''
    Write every section of in_obj to outfile and return the counts.

    in_obj maps a section header tuple to its list of row dictionaries.
    Each header is written on a line prefixed with ' -01- ', each row on
    a line prefixed with ' -02- ' (row keys not in in_fieldnames are
    ignored).  Returns (rows written, headers written, rows with a
    non-empty INT_CODE_EX0006 or INT_CODE_EX0028).
    '''
    tot_cnt = rec_cnt = email_cnt = 0
    with open(outfile, 'wb') as writer_outfile:
        writer = csv.writer(writer_outfile, delimiter=',')
        dict_writer = csv.DictWriter(writer_outfile,
                                     fieldnames=in_fieldnames,
                                     extrasaction='ignore')
        for section_header, section_rows in in_obj.iteritems():
            writer_outfile.write(' -01- ')
            writer.writerow(section_header)
            rec_cnt += 1
            for record in section_rows:
                has_email_code = not (record['INT_CODE_EX0006'] == '' and
                                      record['INT_CODE_EX0028'] == '')
                if has_email_code:
                    email_cnt += 1
                writer_outfile.write(' -02- ')
                dict_writer.writerow(record)
                tot_cnt += 1
    return tot_cnt, rec_cnt, email_cnt
def get_recon_totals(in_obj):
    '''
    Read the client's footer totals from in_obj.

    in_obj must provide readlines(); all remaining lines are consumed.
    A footer line has the form  T,<label>,<value>  where the label is
    'Total Amount', 'Receipt Count' or 'Email Receipt Count'.

    Returns (total amount, receipt count, e-receipt count).  A value that
    was found is returned as the string from the file without its line
    ending; a value that was not found is returned as 0.
    '''
    print(in_obj)
    # label -> message printed when the label is found
    found_messages = {
        'Total Amount': 'Total Amount found.',
        'Receipt Count': 'Receipt Count found.',
        'Email Receipt Count': 'E-Receipt Count Found.',
    }
    totals = dict.fromkeys(found_messages, 0)
    for line in in_obj.readlines():
        # Drop the line ending first; otherwise it stays attached to the
        # last field and ends up in the returned value.
        fields = line.rstrip('\r\n').split(',')
        # Footer lines need a marker, a label and a value.
        if len(fields) < 3 or fields[0] != 'T':
            continue
        label = fields[1]
        if label in totals:
            print(found_messages[label])
            totals[label] = fields[2]
    return (totals['Total Amount'],
            totals['Receipt Count'],
            totals['Email Receipt Count'])
if __name__ == '__main__':
main()
If your file is not very large, you can convert the reader generator to a list of dictionaries by calling list() on the reader, and then use that list in your code instead of trying to read from the file directly.
Example -
def main():
infile = sys.argv[1]
outfile = sys.argv[2]
with open(infile, 'rbU') as in_obj:
# Create reader object, get fieldnames for later on
reader, fieldnames = open_reader(in_obj)
reader_list = list(reader)
nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(reader_list)
print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
# This switches the dictionary to a sorted list... necessary??
reader_list = sorted(reader_list, key=lambda key: (key['PEOPLE_ID'],
key['DON_DATE']))
.
.
def get_recon_totals(reader_list):
print in_obj
client_tot_cnt = 0
client_rec_cnt = 0
client_erec_cnt = 0
for line in reader_list: #line here is a dict
if line[<fieldname for first column>] == 'T' and line[<fieldname for secondcolumn>] == 'Total Amount':
print 'Total Amount found.'
client_tot_cnt = line[<fieldname for third column>]
.
. #continued like above
.
return client_tot_cnt, client_rec_cnt, client_erec_cnt

Categories