Having a bit of an issue trying to figure out how to save the output of my python script as a CSV. When I run this script, the file does not appear in the location that i need in order to access it. Any suggestions?
import pandas as pd
import os
folder_path = os.path.join("T:", "04. Testing","3. Wear Testing","TESTS","CKUW","180604 OP STRAPLESS","Survey Response Data")
mapping_path = os.path.join(folder_path + r'\Survey_MappingTable Strapless.xlsx')
# Read mapping table
mapping = pd.ExcelFile(mapping_path)
mapping.sheet_names
# ['SurveyInfo', 'Question Mapping', 'Answer Mapping']
# Transform sheets to 3 tables (surveyinfo, Q_mapping, A_mapping)
surveyinfo = mapping.parse("SurveyInfo")
Q_mapping = mapping.parse("Question Mapping", skiprows = 2)
A_mapping = mapping.parse("Answer Mapping", skiprows = 3)
# Get input file name and read the data. Table name is df.
input_file_name = surveyinfo.loc[surveyinfo['Parameter Name']=='Input File Name','Value'].to_string(index=False)
path = os.path.join(r'T:\04. Testing\3. Wear Testing\TESTS\CKUW\180604 OP STRAPLESS\Survey Response Data',input_file_name)
df = pd.read_csv(path,header=None,engine='python')
# ,encoding='utf-8' Tried this as a way to fix but it didn't work
# Fill in previous colunmn names if blank, using the preceeding header
df.iloc[0] = df.iloc[0].fillna(method='ffill')
# Read the count of columns
n_col = len(df.iloc[0])
n_respondent = len(df)-2
c_name = []
for i in range(n_col):
# Multiple columns; each columns with differnt single answer. and the question text is to combine the category ex. support, comfort, are both in the satisfaction category etc.
# If it's satisfaction question, concatenate first row and second row
if "satisfaction" in df.iloc[0][i]:
c_name.append(df.iloc[0][i]+df.iloc[1][i])
elif "functionality" in df.iloc[0][i]:
c_name.append(df.iloc[0][i]+df.iloc[1][i])
elif ("shape" in df.iloc[0][i]) and ("please specify" in df.iloc[1][i]):
c_name.append(df.iloc[0][i]+df.iloc[1][i])
elif ("room in the cup" in df.iloc[0][i]) and ("please specify" in df.iloc[1][i]):
c_name.append(df.iloc[0][i]+df.iloc[1][i])
# - in the column header which is part of the question and part of the response
elif ("wire" in df.iloc[0][i]) and ("Response" not in df.iloc[1][i]):
if "-" in df.iloc[1][i]:
c_name.append(df.iloc[0][i]+df.iloc[1][i][df.iloc[1][i].find("-")+2:])
else:
c_name.append(df.iloc[0][i]+df.iloc[1][i])
for j in range(n_respondent):
if pd.notnull(df.iloc[j+2,i]) and "please specify" not in df.iloc[1,i]:
df.iloc[j+2,i] = df.iloc[1,i][:df.iloc[1][i].find("-")-1]
# Multiple columns; each columns with differnt single answer. and the question text is not to combine the category.
# Use to combine band and cup size
elif "size bra do you typically wear?" in df.iloc[0][i]:
c_name.append(df.iloc[0][i])
for j in range(n_respondent):
if pd.notnull(df.iloc[j+2,i]):
df.iloc[j+2,i] = df.iloc[1,i] + df.iloc[j+2,i]
# Single answer to the question; or multiple answers to the question but the answer is the same as the column header
else:
c_name.append(df.iloc[0][i])
# Make the column names as the first row
df.columns = c_name
# Drop the first and second rows
df2 = df.drop(df.index[[0,1]])
# Transform the wide dataset to a long dataset;
r = list(range(10))+list(range(17,20)) # skipping "What size bra do you typically wear? (only select one size)"
df_long = pd.melt(df2,id_vars = list(df.columns[r]), var_name = 'Question', value_name = 'Answer')
# Delete rows with null value to answer
df_long_notnull = df_long[pd.notnull(df_long['Answer'])]
# Make typically wear as a column dimension
sizewear = df_long_notnull.loc[df_long_notnull['Question'] == 'What size bra do you typically wear? (Only select one size)']
sizewear2 = sizewear[['Respondent ID','Collector ID','Email Address','Answer']]
sizewear2.columns = ['Respondent ID','Collector ID','Email Address','What size bra do you typically wear?']
df_long_notnull2 = df_long_notnull[df_long_notnull['Question'] != 'What size bra do you typically wear? (Only select one size)']
df_final = pd.merge(df_long_notnull2, sizewear2, how='left', on=['Respondent ID','Collector ID','Email Address'])
# Join Answer description mapping table
df_full = pd.merge(df_final, A_mapping, how='left', left_on = ['Question','Answer'], right_on = ['Question','Answer Description'])
df_full.loc[df_full['Answer_y'].isnull(),'Answer_y'] = df_full['Answer_x']
df_full.loc[df_full['Answer Description'].isnull(),'Answer Description'] = df_full['Answer_x']
df_full = df_full.drop(labels = ['Answer_x'], axis=1)
df_full = df_full.rename(columns = {'Answer_y':'Answer','Answer Description':'Answer Desc'})
# Join Question Mapping table
df_full = pd.merge(df_full,Q_mapping, how='left', left_on = ['Question'], right_on = ['Raw Column Name'])
df_full = df_full.drop(labels = ['Raw Column Name'], axis=1)
# Get Survey Info
product_name = surveyinfo.loc[surveyinfo['Parameter Name']=='Product Name','Value'].to_string(index=False)
if "," in surveyinfo.loc[surveyinfo['Parameter Name']=='Style Number','Value'].item():
style_number = surveyinfo.loc[surveyinfo['Parameter Name']=='Style Number','Value'].to_string(index=False).split(',')
style_number = [s.strip() for s in style_number]
else:
style_number = surveyinfo.loc[surveyinfo['Parameter Name']=='Style Number','Value'].to_string(index=False)
if "," in surveyinfo.loc[surveyinfo['Parameter Name']=='Style Name','Value'].item():
style_name = surveyinfo.loc[surveyinfo['Parameter Name']=='Style Name','Value'].to_string(index=False).split(',')
style_name = [s.strip() for s in style_name]
else:
style_name = surveyinfo.loc[surveyinfo['Parameter Name']=='Style Name','Value'].to_string(index=False)
# get survey information
survey_name = surveyinfo.loc[surveyinfo['Parameter Name']=='Survey Name','Value'].to_string(index=False)
survey_id = surveyinfo.loc[surveyinfo['Parameter Name']=='Survey ID','Value'].item()
survey_year = surveyinfo.loc[surveyinfo['Parameter Name']=='Survey Year','Value'].item()
survey_mo = surveyinfo.loc[surveyinfo['Parameter Name']=='Survey Month','Value'].item()
output_file_name = surveyinfo.loc[surveyinfo['Parameter Name']=='Output File Name','Value'].to_string(index=False)
# adding columns for survey information
df_full['Product Name'] = product_name
df_full['Survey Name'] = survey_name
df_full['Survey ID'] = survey_id
df_full['Survey Year'] = survey_year
df_full['Survey Month'] = survey_mo
### create a table with style_number and style_name
if type(style_name) == list:
style_t = pd.DataFrame(list(zip(style_name, style_number)), columns = list(["Style_Name","Style_Number"]))
df_full = pd.merge(df_full, style_t, how='left', left_on = ['Which style did you receive?'], right_on = ['Style_Name'])
else:
df_full['Style Name'] = style_name
df_full['Style Number'] = style_number
# Identify the path for saving output file
path_out = os.path.join("C:","Users","Sali3",output_file_name)
# Save as comma separated csv file
df_full.to_csv(path_out, sep=',', index = False)
The last portion of this script here is where i am having a problem. The path_out should be on my local "C" drive as a CSV file. Please help.
Assuming you are on Windows, the documentation on os.path.join says:
On Windows, the drive letter is not reset when an absolute path component (e.g., r'\foo') is encountered. If a component contains a drive letter, all previous components are thrown away and the drive letter is reset. Note that since there is a current directory for each drive, os.path.join("c:", "foo") represents a path relative to the current directory on drive C: (c:foo), not c:\foo.
This should fix your problem:
path_out = os.path.join("C:\\","Users","Sali3",output_file_name)
Related
Learning Pandas and trying to make a code for myself so noob in this.
I'm running into an error:
ValueError: cannot set a row with mismatched columns
I have an app created in PySimpleGui which would take input from user and create a database with columns predefined.
I'm unable to add the rows and running into this issue.
I've made sure I have 6 columns and therefore the number of data to be inserted is 6 but I still get this error.
The code I have at the moment is :
output_path_csv = os.path.join(os.path.join(os.environ['USERPROFILE']), 'Desktop\\Daily_Tracker.xlsx')
if os.path.exists(output_path_csv):
pass
else:
x_header = ['Date', 'Case Number', 'Serial Number', 'Product Name', 'Version', 'Issue']
df = pd.DataFrame( columns=x_header )
df.to_excel( output_path_csv, index=False)
p_name = prod_name(values['Serial_Num']) # Determining the product name
values['Product_Name'] = p_name
v_name = ver_name(values['Serial_Num']) # Determining the version type of the product selected
values['Product_Type'] = v_name
c_num = values.get('Case_Num')
s_num = values.get('Serial_Num')
c_Issue = values.get('Issue')
# to check for e-mail address
sp_char = "#"
if sp_char in values['Email_Add']:
doc.render(values)
output_path_doc = os.path.join(os.path.join(os.environ['USERPROFILE']), 'Desktop\\Notes.docx')
try:
doc.save(output_path_doc)
popup("File Saved", f"File has been saved here : {output_path_doc}")
os.startfile(output_path_doc)
except PermissionError:
popup('File seems to be opened! Please close the file!')
pass
else:
popup('Missing # character in the e-mail. Please update')
pass
df2 = pd.read_excel( output_path_csv )
df2.loc[len( df2.index )] = [to_date, c_num, s_num, p_name, v_name, c_Issue]
df2.to_excel( output_path_csv )
I'm having some doubts with the following function. I want it to show me the result in a single excel tab but I can't.
def create_df_from_table(c,tab, excelWriter):
list_name = str(c)+"_result_list"
list_name = []
for i,each_row in enumerate(each_tab.rows):
text = (each_cell.text for each_cell in each_row.cells)
if i == -1:
keys = tuple(text)
else:
each_dict_val = tuple(text)
list_name.append(each_dict_val)
list_name_copy = list_name.copy()
result_df = pd.DataFrame(list_name)
print(result_df)
result_df.to_excel(excelWriter, sheet_name=str(c))
return result_df
excelWriter = pd.ExcelWriter('tablasFromDocx1.xlsx')
for c, each_tab in enumerate(file.tables):
globals()[f'result_df_{c}'] = create_df_from_table(c,each_tab, excelWriter)
excelWriter.save()
The code above in line 14 (result_df.to_excel() ) passes the dataframe to excel but in more than one tab and I need only all the data in one
I have two excel tables:
old_data.xlsx
Product number Name Current price Other columns
1000 Product name 1 10
AB23104 Product name 2 5
430267 Product name 3 20
new_data.xlsx
Product number Name New price Other columns
AB23104 Renamed product name 2 20
1000 Renamed product name 1 5
345LKT10023 Product name 4 100
Expected result: table below + 2 feedback messages somewhere
Message 1) Product ID 430267 is missing in new data file
Message 2) Product ID 345LKT10023 is newly added
Product ID Name of product New price Old price
AB23104 Product name 2 20 5
1000 Product name 1 5 10
345LKT10023 Product name 4 100 100
I have this code for now, but it is not working and not finished due to lack of knowledge on my part:
import openpyxl
import pandas as pd
new_datacols = [0, 1, 2]
old_datacols = [0, 1, 2]
new_data = pd.read_excel('new_data.xlsx', skiprows=1, usecols=new_datacols, index_col=0)
old_data = pd.read_excel('old_data.xlsx', skiprows=1, usecols=old_datacols, index_col=0)
def format_data():
# combine_type = inner, left, right, outer
df = pd.merge(new_data, old_data, on='Product number', how='outer')
df = df.rename(columns={"Product number": "Product ID",
"Name": "Name of product",
"Current price": "Old price"})
nan_value = float("NaN")
df.replace("", nan_value, inplace=True)
df.dropna(subset=["Name of product"], inplace=True)
df = df[['Product ID', 'Name of product',
'New price', 'Old price']]
print(df.columns)
# df.to_excel('updated_table.xlsx')
if __name__ == "__main__":
format_data()
This is my attempt. It puts the messages in another sheet in the same file. The final spreadsheet looks like this:
import os
import pandas as pd
old_data_filename = r"old_data.xlsx"
new_data_filename = r"new_data.xlsx"
new_spreadsheet_filename = r"updated_products.xlsx"
# Load spreadsheets into a dataframe and set their indexes to "Product number"
old_data_df = pd.read_excel(old_data_filename).set_index("Product number")
new_data_df = pd.read_excel(new_data_filename).set_index("Product number")
# Determine which products are new/missing, and store the corresponding
# messages in a list, which will be written to its own spreadsheet at the end
old_data_products = set(old_data_df.index)
new_data_products = set(new_data_df.index)
new_products = new_data_products - old_data_products
missing_products = old_data_products - new_data_products
messages = [f"Product ID {product} is missing in new data file" for product in missing_products]
messages.extend(f"Product ID {product} is newly added" for product in new_products)
messages = [f"Message {i}) {message}" for i, message in enumerate(messages, start=1)]
# Keep the original product names
new_data_df.update(old_data_df["Name"])
# Old price is the same as new price unless the product is in old_data_df, in which
# case it is old_data_df["Current price"]
new_data_df["Old price"] = new_data_df["New price"]
new_data_df["Old price"].update(old_data_df["Current price"])
# Rename the columns
new_data_df.reset_index(inplace=True)
new_data_df.rename(columns={"Product number": "Product ID",
"Name": "Name of product"}, inplace=True)
# Remove all other columns except the ones we want
new_data_df = new_data_df[["Product ID",
"Name of product",
"New price", "Old price"]]
# Write the new products and messages to separate sheets in the same file
with pd.ExcelWriter(new_spreadsheet_filename) as writer:
new_data_df.to_excel(writer, "Products", index=False)
pd.DataFrame({"Messages": messages}).to_excel(writer, "Messages", index=False)
# Launch the new spreadsheet
os.startfile(new_spreadsheet_filename)
EDIT: Code that works with the actual spreadsheets:
import os
import pandas as pd
old_data_filename = r"old_data.xlsx"
new_data_filename = r"new_data.xlsx"
new_spreadsheet_filename = r"updated_products.xlsx"
# Load spreadsheets into a dataframe and set their indexes to "Product number"
old_data_df = pd.read_excel(old_data_filename).set_index("Product ID")
new_data_df = pd.read_excel(new_data_filename).set_index("Product ID")
# Remove duplicated indexes for both the dataframes, keeping only the first occurrence
old_data_df = old_data_df[~old_data_df.index.duplicated()]
new_data_df = new_data_df[~new_data_df.index.duplicated()]
# Determine which products are new/missing, and store the corresponding
# messages in a list, which will be written to its own spreadsheet at the end
old_data_products = set(old_data_df.index)
new_data_products = set(new_data_df.index)
new_products = new_data_products - old_data_products
missing_products = old_data_products - new_data_products
messages = [f"Product ID {product} is missing in new data file" for product in missing_products]
messages.extend(f"Product ID {product} is newly added" for product in new_products)
messages = [f"Message {i}) {message}" for i, message in enumerate(messages, start=1)]
# Keep the original product names
new_data_df.update(old_data_df["Name"])
# Old price is the same as new price unless the product is in old_data_df, in which
# case it is old_data_df["Current price"]
new_data_df["Old price"] = new_data_df["New price"]
new_data_df["Old price"].update(old_data_df["Current price"])
# Rename the "Name" column to "Name of product"
new_data_df.rename(columns={"Name": "Name of product"}, inplace=True)
# Remove all other columns except the ones we want
new_data_df.reset_index(inplace=True)
new_data_df = new_data_df[["Product ID",
"Name of product",
"New price", "Old price"]]
# Write the new products and messages to separate sheets in the same file
with pd.ExcelWriter(new_spreadsheet_filename) as writer:
new_data_df.to_excel(writer, "Products", index=False)
pd.DataFrame({"Messages": messages}).to_excel(writer, "Messages", index=False)
# Launch the new spreadsheet
os.startfile(new_spreadsheet_filename)
I have created a code to get users of my platform based on 2 things:
choiceTitle: search for a specific word contained in the title of an Ad that users of my platform have looked at. For eg, the Ad is "We are offering free Gin" and I want to get the word 'Gin'
PrimaryTagPreviousChoice: the Ad has a "Food and Drink" tag
I can get those users who are interested in Gin and Food and Drink with:
(df2['choiceTitle'].str.contains("(?i)Gin")) & (df2['PrimaryTagPreviousChoice'].str.contains("(?i)Food and Drink"))
What I'd like to do is create a function with all my code inside (hence the sql query, the rename operation, the sort_values operation etc....) and then use the INPUT function. So I'll just have to run my code, so that python will ask me 2 questions:
choiceTitle? ... Gin
PrimaryTagPreviousChoice? ...Food and Drink.
I enter the 2 options and it gives me the users interested in, let's say, Gin and Food and Drink.
How can I do it?
MY CODE:
df = pd.read_sql_query(""" select etc..... """, con)
df1 = pd.read_sql_query(""" select etc..... """, con)
df1['user_id'] = df1['user_id'].apply(str)
df2 = pd.merge(df, df1, left_on='user_id', right_on='user_id', how='left')
tag = df2[
(df2['choiceTitle'].str.contains("(?i)Gin")) &
(df2['PrimaryTagPreviousChoice'].str.contains("(?i)Food and Drink"))
]
dw = tag[['user', 'title', 'user_category', 'email', 'last_login',
'PrimaryTagPreviousChoice', 'choiceTitle'
]].drop_duplicates()
dw = dw.sort_values(['last_login'], ascending=[False])
dw = dw[dw.last_login > dt.datetime.now() - pd.to_timedelta("30day")]
dw = dw.rename({'user': 'user full name', 'title': 'user title'}
, axis='columns')
dw.drop_duplicates(subset ="Email",
keep = 'first', inplace = True)
Adding a function in Python is simple. Just use the def keyword to declare the function and put your existing code under it (indented). Put parameters in the parenthesis.
Here is the updated code:
def GetUsers (title, tag)
df = pd.read_sql_query(""" select etc..... """, con)
df1 = pd.read_sql_query(""" select etc..... """, con)
df1['user_id'] = df1['user_id'].apply(str)
df2 = pd.merge(df, df1, left_on='user_id', right_on='user_id', how='left')
tag = df2[
(df2['choiceTitle'].str.contains("(?i)" + title)) &
(df2['PrimaryTagPreviousChoice'].str.contains("(?i)" + tag))]
dw = tag[['user', 'title', 'user_category', 'email', 'last_login',
'PrimaryTagPreviousChoice', 'choiceTitle'
]].drop_duplicates()
dw = dw.sort_values(['last_login'], ascending=[False])
dw = dw[dw.last_login > dt.datetime.now() - pd.to_timedelta("30day")]
dw = dw.rename({'user': 'user full name', 'title': 'user title'}
, axis='columns')
dw.drop_duplicates(subset ="Email",
keep = 'first', inplace = True)
return dw # send back to print statement
# get input from user
inpTitle = input ("choiceTitle? ")
inpTag = input ("PrimaryTagPreviousChoice? ")
# run function
result = GetUsers (inpTitle, inpTag)
print(result)
Try this. Save your input() as variables and use string concatenation to edit your mask. Note that an additional set of {} is needed for escaping.
choiceTitle = input('choiceTitle?')
PrimaryTagPreviousChoice = input('PrimaryTagPreviousChoice?')
mask = df2[(df2['choiceTitle'].str.contains("(?i){{0}}".format(choiceTitle))) &
(df2['PrimaryTagPreviousChoice'].str.contains("(?i)
{{0}}".format(PrimaryTagPreviousChoice)))]
dw = mask[['user', 'title', 'user_category', 'email', 'last_login',
'PrimaryTagPreviousChoice', 'choiceTitle'
]].drop_duplicates()
....
I have 2 tables on Excel:
.
I've created an excel Pivot Table using Python but I could not find a simple way to create a calculated field inside it (like I would do with VB) which matches Region from left table and Region from right table.
So I did this, using the module win32com.client:
First, stored the content of the tables in two lists : myTable and myRates.
Then, added a new column to the original left table where I calculated CA * (1 + rate). The code here:
calField = [['CA Bonifié']] #first element as a title for the new column :
for a, testMyTable in enumerate(myTable):
for b, testMyRates in enumerate(myRates):
if a >0 and b > 0:
if testMyTable[0] == testMyRates[0]:
calField.append( [ testMyTable[ len(testMyTable)-1 ] * ( 1+testMyRates[1] ) ] )
for i, testDataRow in enumerate(calField):
for j, testDataItem in enumerate(testDataRow):
Sheet1.Cells(i+1,len(testMyTable)+1).Value = testDataItem
What it does in the sheet "source":
What it does in the created sheet "TCD":
Result is ok but I don't like this method as it alterates the original table. So I'm looking a simplest method to do that.
Thanks in advance for your help
PS : The whole code below. May it help.
import win32com.client
Excel = win32com.client.gencache.EnsureDispatch('Excel.Application')
win32c = win32com.client.constants
Excel.Visible = True
wb = Excel.Workbooks.Open('C:/Users/Documents/Python/classeur.xlsx')
Sheet1 = wb.Worksheets('Source')
def getContiguousRange(fichier, sheet, row, col):
bottom = row
while sheet.Cells(bottom + 1, col).Value not in [None, '']:
bottom = bottom + 1
right = col
while sheet.Cells(row, right + 1).Value not in [None, '']:
right = right + 1
return sheet.Range(sheet.Cells(row, col), sheet.Cells(bottom, right)).Value
myTable = getContiguousRange(fichier = wb, sheet = Sheet1, row = 1, col = 1)
myRates = getContiguousRange(fichier = wb, sheet = Sheet1, row = 1, col = 8)
calField = [['CA Bonifié']]
for a, testMyTable in enumerate(myTable):
for b, testMyRates in enumerate(myRates):
if a >0 and b > 0:
if testMyTable[0] == testMyRates[0]:
calField.append( [ testMyTable[ len(testMyTable)-1 ] * ( 1+testMyRates[1] ) ] )
for i, testDataRow in enumerate(calField):
for j, testDataItem in enumerate(testDataRow):
Sheet1.Cells(i+1,len(testMyTable)+1).Value = testDataItem
cl1 = Sheet1.Cells(1,1)
cl2 = Sheet1.Cells(len(myTable),len(myTable[0])+1)
pivotSourceRange = Sheet1.Range(cl1,cl2)
pivotSourceRange.Select()
Sheet2 = wb.Sheets.Add (After=wb.Sheets (1))
Sheet2.Name = 'TCD'
cl3=Sheet2.Cells(4,1)
pivotTargetRange= Sheet2.Range(cl3,cl3)
pivotTableName = 'tableauCroisé'
pivotCache = wb.PivotCaches().Create(SourceType=win32c.xlDatabase, SourceData=pivotSourceRange, Version=win32c.xlPivotTableVersion14)
pivotTable = pivotCache.CreatePivotTable(TableDestination=pivotTargetRange, TableName=pivotTableName, DefaultVersion=win32c.xlPivotTableVersion14)
pivotTable.PivotFields('Service').Orientation = win32c.xlRowField
pivotTable.PivotFields('Service').Position = 1
pivotTable.PivotFields('Region').Orientation = win32c.xlPageField
pivotTable.PivotFields('Region').Position = 1
pivotTable.PivotFields('Region').CurrentPage = 'IDF'
dataField = pivotTable.AddDataField(pivotTable.PivotFields('CA'))
dataField.NumberFormat = '# ### €'
calculField = pivotTable.AddDataField(pivotTable.PivotFields('CA Bonifié'))
calculField.NumberFormat = '# ### €'
# wb.SaveCopyAs('C:/Users/Documents/Python/tcd.xlsx')
# wb.Close(True)
# Excel.Application.Quit()
Note: I'm using Sheet1 as the Image show all relevant indices and its easier to verify.
You can move the Formula to the PivotTabel at a later Step, once verified.
STEP Replace Column E with the Formula =VLOOKUP
Reference: how-to-use-vlookup-match
Replace the following in your Code:
for row, testDataRow in enumerate(calField, 2):
#Sheet1.Cells(i+1,len(testMyTable)+1).Value = testDataItem
Sheet1.Cells(row, 5).Formula = '=VLOOKUP(A{}, H1:I5, MATCH(H1,H1:I1))'.format(row)
The Result should show the matching Taux!
Come back and confirm Results are OK!
STEP Compute Taux