Retype csv file in python - python

I have a CSV file with a key/value layout, and I am not sure how to describe it, so I pasted it below. I need to reshape it into a normal CSV file. The "age" key has no value. I only want to retrieve "first_name", "last_name", and "age", output those three columns, and use "first_name","last_name","age" as the header row.
"ID","meta_key","meta_data"
1,"nickname","dale ganger"
2,"first_name","ganger"
3,"last_name","dale"
4,"age",
5,"sex","F"
6,"nickname","dale ganger"
7,"first_name","ganger"
8,"last_name","dale"
9,"age",
10,"sex","F"
11,"nickname","dale ganger"
12,"first_name","ganger"
13,"last_name","dale"
14,"age",
15,"sex","F"
I used this code, but it doesn't merge the headers:
import pandas as pd
# NOTE(review): a bare transpose only flips rows and columns; it does not
# group every 5 key/value rows into one record, which is why the "headers"
# are never merged in the output shown below.
pd.read_csv('input.csv', header=None).T.to_csv('output.csv', header=False, index=False)
output
ID,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
meta_key,nickname,first_name,last_name,age,sex,nickname,first_name,last_name,age,sex,nickname,first_name,last_name,age,sex
meta_data,dale ganger,ganger,dale,,F,dale ganger,ganger,dale,,F,dale ganger,ganger,dale,,F
The final look I want
nickname,first_name,last_name,age,sex
dale ganger,ganger,dale,,F
dale ganger,ganger,dale,,F
dale ganger,ganger,dale,,F

Try this:
# Load the key/value CSV. BUG FIX: pd.DataFrame('data.csv') passes the file
# NAME to the DataFrame constructor (which raises); read_csv parses the file.
df_csv = pd.read_csv('data.csv')
# Drop the ID column and flip the table so meta_key/meta_data become rows.
df = df_csv.drop('ID', axis=1).transpose()
df.columns = df.iloc[0]  # promote the meta_key row to the header
df = df.iloc[1:, :].reset_index(drop=True)
df = df[['first_name', 'last_name', 'age']]
to replicate everything:
import pandas as pd

# Reproducible example: the raw table stores one (key, value) pair per row.
data = {'ID': [1, 2, 3, 4, 5],
        'meta_key': ['nickname', 'first_name', 'last_name', 'age', 'sex'],
        'meta_data': ['dale ganger', 'ganger', 'dale', '', 'F']}
df_csv = pd.DataFrame(data)
print(df_csv)  # before change

# Flip the table so each meta_key becomes a column, promote the key row to
# the header, then keep only the three requested fields.
df = df_csv.drop(columns='ID').T
df.columns = df.iloc[0]
df = df.drop(df.index[0]).reset_index(drop=True)
df = df.loc[:, ['first_name', 'last_name', 'age']]
print(df)  # after change
output is:
first_name
last_name
age
0
ganger
dale
I see you changed your question, now the data is repeating every 5 rows. Then I would do this below:
# The records repeat every 5 rows (nickname, first_name, last_name, age,
# sex), so walk the frame in strides of 5 and collect one record per stride.
# BUG FIX: the loop body had lost its indentation as pasted (SyntaxError).
df = pd.read_csv('unstructured.csv')
# create a dictionary to store the data each iteration
data_dict = {'nickname': [], 'first_name': [], 'last_name': [], 'age': [], 'sex': []}
for i in range(0, len(df), 5):
    data_dict['nickname'].append(df['meta_data'][i])
    data_dict['first_name'].append(df['meta_data'][i + 1])
    data_dict['last_name'].append(df['meta_data'][i + 2])
    data_dict['age'].append(df['meta_data'][i + 3])
    data_dict['sex'].append(df['meta_data'][i + 4])
new_df = pd.DataFrame(data_dict)
print(new_df)

Related

Python groupby output

I'm trying to take a spreadsheet input and display only the urls that are CMS related (wordpress|Wordpress|WordPress|Drupal|drupal|Joomla|joomla). I'm trying to get the output to be "technologies" and then the "url" associated (grouped) to those urls.
Link to data file
output
Code is:
import pandas as pd
import numpy as np
dataset="ASD-example.xlsx"
# Pipe-separated case variants; used as a regex pattern by str.contains below.
term_cms = 'wordpress|Wordpress|WordPress|Drupal|drupal|Joomla|joomla'
df = pd.read_excel((dataset), sheet_name="HTTPX")
# Per-technology row count broadcast back onto every row.
df['technology_count'] = df.groupby('technologies')['url'].transform('count')
df.drop(['timestamp', 'request', 'response-header', 'scheme', 'port', 'body-sha256','header-sha256', 'a', 'cnames', 'input', 'location', 'error', 'response-body', 'content-type','method', 'host', 'content-length', 'chain-status-codes', 'status-code', 'tls-grab', 'csp', 'vhost','websocket', 'pipeline', 'http2', 'cdn', 'response-time', 'chain', 'final-url', 'failed','favicon-mmh3', 'lines', 'words','path','webserver'],inplace=True,axis=1)
# NOTE(review): this filter is a no-op -- the filtered frame is never
# assigned back (should be df = df[...]), so the pivot below sees ALL rows.
df[df['technologies'].str.contains(term_cms, na=False)]
pivot1 = pd.pivot_table(df, index=['technologies', 'url'], columns=None, fill_value=0)
print(pivot1)
I cleaned your code a bit to make it more readable to get the output you want.
# Keep only the CMS technologies of interest, then count url hits per CMS.
term_cms = ["wordpress", "drupal", "joomla"]
# Normalise the technology names: strip the square brackets, then lowercase
# so one spelling matches all the case variants.
df['technologies'] = df['technologies'].str.strip('[]')
df['technologies'] = df['technologies'].str.lower()
# Keep only rows whose technology is one of the wanted CMSs.
df = df[df['technologies'].isin(term_cms)]
# One row per (technology, url) pair with its occurrence count.
df = df.groupby(['technologies', 'url']).size().reset_index(name='technology_count')
Output:
technologies url technology_count
0 joomla https://testcom123. 1
1 wordpress https://test.com:443 1

How do I load multiple Excel files (containing multiple sheets) into a DataFrame?

Below is the code that I currently have, but I cannot figure out what I'm doing wrong. Do I need to change each of the "with ... as xls" to something else such as "with ... as xls1", "with ... as xls2", "with ... as xls3", etc.?
def load_ticket_():
    """Load the 2015-2019 AnnArbor ticket workbooks plus the Jan-2020 one
    and return all of them concatenated into a single DataFrame.

    NOTE(review): the original version built the yearly frames but never
    returned anything (hence the AssertionError in the test below), and the
    test calls load_ticket_data() while this function is named
    load_ticket_ -- confirm which name the grader expects.
    """
    import pandas as pd
    import warnings
    warnings.filterwarnings('ignore')

    columns = ['Ticket #', 'Badge', 'Issue Date', 'IssueTime', 'Plate',
               'State', 'Make', '', 'Violation', 'Description', 'Location',
               'Meter', 'Fine', 'Penalty']

    def _load_year(path):
        # Each yearly workbook has the same 3-sheet layout: Sheet1 carries an
        # extra header row, Sheet3 a footer row to drop.
        with pd.ExcelFile(path) as xls:
            first = pd.read_excel(xls, 'Sheet1', skiprows=0, header=1)
            first.columns = columns
            second = pd.read_excel(xls, 'Sheet2', skiprows=0)
            third = pd.read_excel(xls, 'Sheet3', skiprows=0, skipfooter=1)
        return pd.concat([first, second, third])

    frames = [_load_year('AnnArbor-TicketViolation{}.xls'.format(year))
              for year in range(2015, 2020)]

    # The Jan-2020 file only has a single sheet.
    with pd.ExcelFile('AnnArbor-TicketViolation-jan2020.xls') as xls:
        df2020 = pd.read_excel(xls, 'Sheet1', skiprows=0, header=1)
    df2020.columns = columns
    frames.append(df2020)

    # BUG FIX: the original function ended here without returning anything.
    return pd.concat(frames)
Below is used to test my code:
import xlrd
import pandas as pd
import numpy as np
import re
# NOTE(review): this calls load_ticket_data(), but the function above is
# defined as load_ticket_() -- one of the two names must be wrong, which is
# why the isinstance assertion fails with a None/undefined result.
df_1_test = load_ticket_data()
assert isinstance(df_1_test, pd.DataFrame), "Q1: What your function returns must be pd.DataFrame."
assert len(df_1_test) == 811439, "Q1: There should be 811439 rows in the dataframe."
assert len(df_1_test.columns) == 14, "Q1: There should be 14 columns in the dataframe."
AssertionError Traceback (most recent call last)
/tmp/ipykernel_88/3174118782.py in <cell line: 7>()
5
6 df_1_test = load_ticket_data()
----> 7 assert isinstance(df_1_test, pd.DataFrame), "Q1: What your function returns must be pd.DataFrame."
8 assert len(df_1_test) == 811439, "Q1: There should be 811439 rows in the dataframe."
9 assert len(df_1_test.columns) == 14, "Q1: There should be 14 columns in the dataframe."
AssertionError: Q1: What your function returns must be pd.DataFrame.
I would suggest looping over the files in the folder, loading each sheet into a data frame, and then collecting all the dataframes into a dictionary.
example below:
import pandas as pd
import os
def load_ticket_():
    """Scan the working directory for AnnArbor ticket workbooks and return a
    dict mapping 'df<year>' -> DataFrame (all sheets of that year combined).
    """
    column_names = ['Ticket #', 'Badge', 'Issue Date', 'IssueTime', 'Plate',
                    'State', 'Make', '', 'Violation', 'Description',
                    'Location', 'Meter', 'Fine', 'Penalty']
    my_files = os.listdir()  # my_files = os.listdir(path)
    # BUG FIX: the files described in the question end in '.xls', but the
    # original filter only kept '.xlsx', so nothing was ever loaded.
    excel_fnames = [f for f in my_files if f.endswith(('.xls', '.xlsx'))]
    df_dict = {}  # empty dictionary for dataframes
    for file in excel_fnames:
        print(f"excel file: '{file}' is loaded")
        # BUG FIX: file[-9:-5] only extracts the year for a 5-char '.xlsx'
        # suffix; take the last 4 characters of the stem instead so '.xls'
        # names ('...2015.xls', '...-jan2020.xls') also yield the year.
        year = os.path.splitext(file)[0][-4:]
        if file != "AnnArbor-TicketViolation-jan2020.xls":
            # read all sheets in one file and name the columns
            sheets_to_df = pd.read_excel(file, sheet_name=None, names=column_names)
            # append each sheet table to the dataframe per file
            df_dict[year] = pd.concat(sheets_to_df, axis=0, ignore_index=True)
        else:
            df_dict[year] = pd.read_excel(file, sheet_name="Sheet1", names=column_names)
    # renaming keys of dictionary to match original code
    df_dict = {f'df{k}': v for k, v in df_dict.items()}
    return df_dict
# NOTE(review): assumes the AnnArbor .xls workbooks are present in the
# current working directory when this runs.
# calling function to load all dataframes to dictionary
final_dict_of_dfs = load_ticket_()
# show all df names stored in the dictionary
print(final_dict_of_dfs.keys())
# print specific df
print(final_dict_of_dfs['df2017'])
# assign new df from the item in the dictionary
df_2017 = final_dict_of_dfs["df2017"]
# or concat all dfs into one
master_df = pd.concat(final_dict_of_dfs.values(), ignore_index=True)
print(master_df)

Add new column to DataFrame with same default value

I would like to add a name column based on the 'lNames' list. But my code is overwriting the whole column in the last iteration as follows:
import pandas as pd
def consulta_bc(codigo_bcb):
    """Fetch one Banco Central do Brasil SGS data series as a DataFrame
    indexed by its parsed 'data' (date) column."""
    url = 'http://api.bcb.gov.br/dados/serie/bcdata.sgs.{}/dados?formato=json'.format(codigo_bcb)
    df = pd.read_json(url)
    # The API returns dates as day-first strings.
    df['data'] = pd.to_datetime(df['data'], dayfirst=True)
    df.set_index('data', inplace=True)
    return df
lCodigos = [12, 11, 1, 21619, 21623, 12466]
lNames = ['CDI', 'SELIC', 'USD', 'EUR', 'GPB', 'IMAB']
iter_len = len(lCodigos)
saida = pd.DataFrame()
for i in range(iter_len):
    saida = saida.append(consulta_bc(lCodigos[i]))
    # NOTE(review): this assigns lNames[i] to the 'nome' column of EVERY row
    # accumulated so far, so the last iteration overwrites the whole column --
    # the bug the answer below fixes. DataFrame.append is also deprecated
    # (removed in pandas 2.0).
    saida['nome']= lNames[i]
saida.to_csv('Indice', sep=';', index=True)
saida
Any help will be fully appreciated
Change the for loop in this way:
# Tag each series with its name BEFORE accumulating it, so 'nome' is set
# per-series instead of overwriting the whole column on every pass.
for i in range(iter_len):
    df = consulta_bc(lCodigos[i])
    df['nome'] = lNames[i]
    # DataFrame.append was removed in pandas 2.0; concat is the supported way.
    saida = pd.concat([saida, df])

Read Excel file without using Pandas and add new columns and print and output file

I am new to python coding and due to some issue, I need to reconfigure my code without pandas.
I am reading an Excel and extracting a few columns with filtered values. Then passing the one column value to a function to fetch the results. The result comes back in a complex dictionary format then I have to create a new column from the dictionary then join the two outputs (initial Excel file and complex dictionary) and print that back in the output file.
So my data is
Customer Customer Name Serial Number
1 XYZ 101011
2 XYZ 1020123
3 XYX 102344
Dictionary output
[{'cert': {'alternate_names': [],
'created_on': '2017-09-10T16:15:25.7599734Z',
'csr_used': False,
'error_details': '',
'revocation_date': None,
'revocation_status': None,
'serial_no': '101011',
'status': 'Expired',
'valid_to': '2020-09-09T23:59:59.0000000Z'},
'meta': {'api_application_biz_unit': '',
'api_client_nuid': '',
'asset_name': '',
'audience': 'External',
'automation_utility': '',
'delegate_owner': '',
'environment': 'Development',
'l2_group_email': None,
'l3_group_email': None,
'requestor_email': '',
'support_email': '',
'tech_delegate_email': None,
'tech_owner_email': None}}]
Desired output:
Customer Customer Name Serial Number Alternate_name Audience Environment
1 XYZ 101011 [] External Dev
My Code:
def create_excel(filename):
    """Read the customer workbook, keep XYZ customers, look up each serial
    number via fetch_by_ser_no(), merge the flattened API response back in,
    and write the combined result to Data.xlsx."""
    data = pd.read_excel(filename, usecols=[4,18,19,20,26,27,28])
    data["Customer Name"].fillna("N/A",inplace= True)
    df = data[data['Customer Name'].str.contains("XYZ",case = False)]
    # One API response (a list holding a nested cert/meta dict) per serial.
    output = df['Serial Number'].apply(lambda x: fetch_by_ser_no(x))
    df2 = pd.DataFrame(output)
    df2.columns = ['Output']
    df5 = pd.concat([df,df2],axis = 1)
    # Flatten the nested dicts into one row of columns per serial number.
    df3 = pd.concat([pd.DataFrame(pd.json_normalize(x)) for x in df2['Output']],
                    ignore_index=False)
    # NOTE(review): column positions (11, 21, 30, 33) are hard-coded against
    # the API response layout -- fragile; confirm against a live response.
    df3["Serial Number"] = df3.iloc[:,11]
    df4 = pd.merge(left = df5, right = df3, how = 'left',
                   left_on = df5["Serial Number"].str.lower(),
                   right_on = df3["Serial Number"].str.lower())
    df4.fillna("N/A",inplace = True)
    # NOTE(review): replace(..., inplace=True) returns None, so this line
    # assigns None to 'Status'; the next line then overwrites it anyway.
    df4["Status"] = df4.iloc[:,21].replace({"N/A":"Cust Not Found"},inplace = True)
    df4["Status"] = df4.iloc[:,21]
    df4["Serial Number"] = df4.iloc[:,4]
    df4["Audience"] = df4.iloc[:,30]
    df4["Environment"] = df4.iloc[:,33]
    df4[["Customer","Customer Name","Serial Number","Common Name","Status",
         "Environment","Audience"]].to_excel(r'Data.xlsx', index = False)
I want to remove the pandas dependency from the code. I am having a hard time figuring this out.

Creating a simple csv with synthetic data - Python

I'm learning python and machine learning and trying to create a very simple csv from synthetic data.
Can anyone help me tweak this to get it to work in PyCharm?
I'm trying to input a random value from the selection in each column.
Much appreciated
import random
import pandas as pd
# Candidate values for each categorical column.
marriage_status = {'single', 'married', 'divorced', 'widowed', 'complicated'}
children = {'yes', 'no'}
employment = {'employed', 'self_employed', 'unemployed', 'student'}
income_abroad = {'yes', 'no'}
gender = {'M', 'F'}
response = {'refund', 'payment'}
columns = ['marriage_status', 'children', 'employment',
           'income_abroad', 'age', 'gender', 'income', 'expenses', 'response']
df = pd.DataFrame(columns=columns)
for i in range(1000):
    # NOTE(review): these assignments shadow the option sets above, so from
    # the second iteration on, random.choice() draws from the CHARACTERS of
    # the previously chosen string, not from the option pool.
    marriage_status = random.choice(list(marriage_status))
    children = random.choice(list(children))
    employment = random.choice(list(employment))
    income_abroad = random.choice(list(income_abroad))
    gender = random.choice(list(gender))
    response = random.choice(list(response))
    age = random.randint(18, 70)
    income = random.randint(0, 100000)
    expenses = random.randint(0, 10000)
    # NOTE(review): this rebinds df to a plain list each pass instead of
    # appending a row to the DataFrame built above -- only the last row
    # survives, and it is no longer a DataFrame.
    df = [marriage_status, children, employment, income_abroad, age, gender, income, expenses, response]
# NOTE(review): df is now a list, so df[6] is an int with no to_csv method;
# 'index = False' on its own line is a no-op (it was meant as an argument).
df[6].to_csv('taxfix_data.csv')
index = False
If you're going to use pandas the easiest way is to do it like this
import random
import pandas as pd

# BUG FIX: the original snippet had an unclosed string ('complicated]), a
# missing comma before index=, and columns of unequal length, which makes the
# DataFrame constructor raise. Draw one random value per row from each pool
# instead, which is what the question asked for.
N_ROWS = 1000

# Fixed option pools (lists, so they are never shadowed by the draws).
_options = {
    'marriage_status': ['single', 'married', 'divorced', 'widowed', 'complicated'],
    'children': ['yes', 'no'],
    'employment': ['employed', 'self_employed', 'unemployed', 'student'],
    'income_abroad': ['yes', 'no'],
    'gender': ['M', 'F'],
    'response': ['refund', 'payment'],
}

# One independent random draw per row for every column; numeric columns use
# uniform integer draws in sensible ranges.
df = pd.DataFrame({
    'marriage_status': random.choices(_options['marriage_status'], k=N_ROWS),
    'children': random.choices(_options['children'], k=N_ROWS),
    'employment': random.choices(_options['employment'], k=N_ROWS),
    'income_abroad': random.choices(_options['income_abroad'], k=N_ROWS),
    'age': [random.randint(18, 70) for _ in range(N_ROWS)],
    'gender': random.choices(_options['gender'], k=N_ROWS),
    'income': [random.randint(0, 100000) for _ in range(N_ROWS)],
    'expenses': [random.randint(0, 10000) for _ in range(N_ROWS)],
    'response': random.choices(_options['response'], k=N_ROWS),
})
df.to_csv('taxfix_data.csv', index=False)
Also here's a really useful cheatsheet for pandas https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf

Categories