File name grouping and indexing - python

I have a folder with txt files like below,
and I have used os.listdir to generate a file list,
['acc_exp01_user01.txt', 'acc_exp02_user01.txt', 'acc_exp03_user02.txt', 'acc_exp04_user02.txt', 'acc_exp05_user03.txt', 'acc_exp06_user03.txt', 'acc_exp07_user04.txt', 'acc_exp08_user04.txt', 'acc_exp09_user05.txt', 'acc_exp10_user05.txt', 'acc_exp11_user06.txt', 'acc_exp12_user06.txt', 'acc_exp13_user07.txt', 'acc_exp14_user07.txt', 'acc_exp15_user08.txt', 'acc_exp16_user08.txt', 'acc_exp17_user09.txt', 'acc_exp18_user09.txt', 'acc_exp19_user10.txt', 'acc_exp20_user10.txt', 'acc_exp21_user10.txt', 'acc_exp22_user11.txt', 'acc_exp23_user11.txt', 'acc_exp24_user12.txt', 'acc_exp25_user12.txt', 'acc_exp26_user13.txt', 'acc_exp27_user13.txt', 'acc_exp28_user14.txt', 'acc_exp29_user14.txt', 'acc_exp30_user15.txt', 'acc_exp31_user15.txt', 'acc_exp32_user16.txt', 'acc_exp33_user16.txt', 'acc_exp34_user17.txt', 'acc_exp35_user17.txt', 'acc_exp36_user18.txt', 'acc_exp37_user18.txt', 'acc_exp38_user19.txt', 'acc_exp39_user19.txt', 'acc_exp40_user20.txt', 'acc_exp41_user20.txt', 'acc_exp42_user21.txt', 'acc_exp43_user21.txt', 'acc_exp44_user22.txt', 'acc_exp45_user22.txt', 'acc_exp46_user23.txt', 'acc_exp47_user23.txt', 'acc_exp48_user24.txt', 'acc_exp49_user24.txt', 'acc_exp50_user25.txt', 'acc_exp51_user25.txt', 'acc_exp52_user26.txt', 'acc_exp53_user26.txt', 'acc_exp54_user27.txt', 'acc_exp55_user27.txt', 'acc_exp56_user28.txt', 'acc_exp57_user28.txt', 'acc_exp58_user29.txt', 'acc_exp59_user29.txt', 'acc_exp60_user30.txt', 'acc_exp61_user30.txt', 'gyro_exp01_user01.txt', 'gyro_exp02_user01.txt', 'gyro_exp03_user02.txt', 'gyro_exp04_user02.txt', 'gyro_exp05_user03.txt', 'gyro_exp06_user03.txt', 'gyro_exp07_user04.txt', 'gyro_exp08_user04.txt', 'gyro_exp09_user05.txt', 'gyro_exp10_user05.txt', 'gyro_exp11_user06.txt', 'gyro_exp12_user06.txt', 'gyro_exp13_user07.txt', 'gyro_exp14_user07.txt', 'gyro_exp15_user08.txt', 'gyro_exp16_user08.txt', 'gyro_exp17_user09.txt', 'gyro_exp18_user09.txt', 'gyro_exp19_user10.txt', 'gyro_exp20_user10.txt', 'gyro_exp21_user10.txt', 'gyro_exp22_user11.txt', 'gyro_exp23_user11.txt', 'gyro_exp24_user12.txt', 'gyro_exp25_user12.txt', 'gyro_exp26_user13.txt', 'gyro_exp27_user13.txt', 'gyro_exp28_user14.txt', 'gyro_exp29_user14.txt', 'gyro_exp30_user15.txt', 'gyro_exp31_user15.txt', 'gyro_exp32_user16.txt', 'gyro_exp33_user16.txt', 'gyro_exp34_user17.txt', 'gyro_exp35_user17.txt', 'gyro_exp36_user18.txt', 'gyro_exp37_user18.txt', 'gyro_exp38_user19.txt', 'gyro_exp39_user19.txt', 'gyro_exp40_user20.txt', 'gyro_exp41_user20.txt', 'gyro_exp42_user21.txt', 'gyro_exp43_user21.txt', 'gyro_exp44_user22.txt', 'gyro_exp45_user22.txt', 'gyro_exp46_user23.txt', 'gyro_exp47_user23.txt', 'gyro_exp48_user24.txt', 'gyro_exp49_user24.txt', 'gyro_exp50_user25.txt', 'gyro_exp51_user25.txt', 'gyro_exp52_user26.txt', 'gyro_exp53_user26.txt', 'gyro_exp54_user27.txt', 'gyro_exp55_user27.txt', 'gyro_exp56_user28.txt', 'gyro_exp57_user28.txt', 'gyro_exp58_user29.txt', 'gyro_exp59_user29.txt', 'gyro_exp60_user30.txt', 'gyro_exp61_user30.txt', 'labels.txt']
but I want to now group into a indexing list like this,
how can I realise it?

You can use glob to find out files based out of a pattern from a path then create the required DataFrame
from glob import glob
import os
exp_path = "Your Path Here"
acc_pattern = "acc_exp*.csv"
gyro_pattern = "gyro_exp*.csv"
acc_files = glob(os.path.join(exp_path,acc_pattern))
gyro_files = glob(os.path.join(exp_path,gyro_pattern))
Once you have all the required files , we can create the DataFrame
df = pd.DataFrame()
df['acc'] = [os.path.basename(x) for x in acc_files]
df['gyro'] = [os.path.basename(x) for x in gyro_files]
df['experiment'] = df['acc'].apply(lambda x:x[7:9])
df['userId'] = df['acc'].apply(lambda x:x[14:16])

Related

supplementary quotes appearing in my csv using python code

I did a code to generate multiple addresses and export it in csv
import csv
import ipaddress
import random
from random import shuffle
LAN = ipaddress.ip_network('192.168.0.0/16')
WAN1 = ipaddress.ip_network('11.10.8.0/22')
WAN2 = ipaddress.ip_network('12.10.8.0/22')
LAN_IP_Adresses = [ IP_LAN for IP_LAN in LAN.hosts()]
WAN1_IP_Adresses = [ IP_WAN1 for IP_WAN1 in WAN1.hosts()]
WAN2_IP_Adresses = [ IP_WAN2 for IP_WAN2 in WAN2.hosts()]
index_IP_GW = len(WAN1_IP_Adresses)-1
locations_list=['Abidjan','Abu Dhabi','Adana','Adelaide', 'Ahmadabad','Algiers','Amsterdam','Ankara','Anshan','Athens','BANGKOK','BUCHAREST','BUDAPEST','Bagdad','Bandung','Bangalore','Bangkok','Barcelona','Beirut','Belgrade','Bern','Bogota','Brasilia','Brazzaville','Brussels','Bursa','CAIRO','CARACAS','CONAKRY','Canberra','Casablanca','Changchun','Chengdu','Chicago','Copenhagen','Dakar','MINSK','Madrid','Medina','Nairobi','Napoli','Montreal',
'Odessa','Omdurman','Osaka','Ottawa','PYONGYANG','Paris','Pekin', 'Perth','Philadelphia','Phoenix','Pimpri Chinchwad','Porto','Porto Alegre','QUITO','Qingdao','Rabat','Rajkot','Riadh','Rio de Janeiro','Rome','SANTIAGO','Salvador','Samara','San Antonio','San Francisco','Sao Paulo','Sendai','Seongnam','Seoul','Shanghai','Singapore','Sydney','Taiyuan','Tehran','Tijuana','Tokyo','Toronto','Moscou','Moscow','Mumbai (Bombay)','Munich','México','Milan',
'Tripoli','Tunis','Vienna','Warsaw','Wuhan','Xian','Yaounde','Yokohama', 'Zapopan','hong kong','Dallas','Delhi','Doha','Dublin','Durban','Ecatepec','Frankfurt','Fukuoka','Giza','Hamburg','Havana','Helsinki','Houston','Hyderabad','Istanbul','Jaipur','Jakarta','Jeddah','Johannesburg','KIEV','Kaduna','Kano','Kazan','Kuala Lumpur''Kyoto','LUANDA','Lahore','Lanzhou','Le Caire','Leon','Lima','Lisbon','London','Los Angeles','Lyon','MANILA','Melbourne','New York']
#Site_Nmb=1
def initial_Sites_list_generator(filename='SITES_CI.csv', Number_of_Sites=1000):
file_to_output = open(filename,'w',newline='')
csv_writer = csv.writer(file_to_output,delimiter=',')
Site_Nbr=1
index = 0
csv_writer.writerow(["SITE_NAME", "SERIAL_NUMBER",'"LAN_IP_ADDRESS"','"WAN_IP_ADDRESS1"','"WAN_IP_ADDRESS2"','"GATEWAY_IP_ADDRESS1"','"GATEWAY_IP_ADDRESS2"','"ROUTE_REFLECTOR"','"LOCATIONS"','"HARDWAREMODEL"','"LANINT"','"WANINT1"','"WANINT2"','"BW_OUT"','"BW_IN"'])
for i in range(1,Number_of_Sites+1):
shuffle(locations_list)
location = random.choice(locations_list)
csv_writer.writerow(['"SITE'+ str(Site_Nbr)+'"',"2e70129bde9c4426b9213d4408c300",f'"{(LAN_IP_Adresses[index])}"',f'"{str(WAN1_IP_Adresses[index])}"',f'"{str(WAN2_IP_Adresses[index])}"',f'"{str(WAN1_IP_Adresses[index_IP_GW])}"',f'"{str(WAN2_IP_Adresses[index_IP_GW])}"','"False"',f'"{location}"','"ONEv600"','"gigabitethernet0/2"','"gigabitethernet0/0"','"gigabitethernet0/1"','"10"','"20"'])
Site_Nbr = Site_Nbr+1
index = index+1
file_to_output.close()
initial_Sites_list_generator('SITES_OVP.csv', 1000)
but i got unnecessary quotes added in my csv
You are adding the extra quotes yourself. In your for loop, change this line:
csv_writer.writerow(['"SITE'+ str(Site_Nbr)+'"',"2e70129bde9c4426b9213d4408c300",f'"{(LAN_IP_Adresses[index])}"',f'"{str(WAN1_IP_Adresses[index])}"',f'"{str(WAN2_IP_Adresses[index])}"',f'"{str(WAN1_IP_Adresses[index_IP_GW])}"',f'"{str(WAN2_IP_Adresses[index_IP_GW])}"','"False"',f'"{location}"','"ONEv600"','"gigabitethernet0/2"','"gigabitethernet0/0"','"gigabitethernet0/1"','"10"','"20"'])
to this:
csv_writer.writerow(['SITE'+ str(Site_Nbr)+"2e70129bde9c4426b9213d4408c300",
f'{(LAN_IP_Adresses[index])}',
f'{str(WAN1_IP_Adresses[index])}',
f'{str(WAN2_IP_Adresses[index])}',
f'{str(WAN1_IP_Adresses[index_IP_GW])}',
f'{str(WAN2_IP_Adresses[index_IP_GW])}',
'False',
f'{location}',
'ONEv600',
'gigabitethernet0/2',
'gigabitethernet0/0',
'gigabitethernet0/1',
'10',
'20'])
The CSV writer already adds quotes to strings as appropriate.
I did
csv_writer = csv.writer(file_to_output,delimiter=",",quoting=csv.QUOTE_ALL)
and it worked !

encoding Lithuanian character in xml using python

I have a code:
def convert_df_to_xml(df,fd,ld):
# sukuriam pagrindini elementa (root) su pavadinimu Invoices.
root = ET.Element("Invoices")
root.set("from", str(fd))
root.set("till", str(ld))
for i in range(len(df['partner_id'])):
# pridedam sub elementa.
invoices = ET.SubElement(root, "Invoice")
invoices.set('clientid',df['company_registry'][i])
invoices.set('imones_pavadinimas', df['partner_id'][i])
# pridedam sub-sub elementa.
quebec = ET.SubElement(invoices, "Product")
# susikraunam eiluciu info is dataframe
sectin_1 = ET.SubElement(quebec, "Name")
sectin_1.text = str(df["Name"][i])
sectin_2 = ET.SubElement(quebec, 'Quantity')
sectin_2.text = str(df["time_dif"][i])
sectin_3 = ET.SubElement(quebec, 'Price')
sectin_3.text = str(df["price_unit"][i])
xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ", encoding="UTF-8").decode("UTF-8")
with open("bandomasis_itp_xml_failas_V_1.1.xml", "w") as f:
f.write(xmlstr)
I'm creating xml file from python DataFrame. The problem is that in xml file I got "?" marks instead "ė" character.
In dataframe i have strings with characters "ė,ą,š,ų" and I need them to be in xml file.
My dataframe:
df1 = pd.DataFrame({'partner_id': ['MED GRUPĖ, UAB'], 'Name':['Pirmas'], 'company_registry': ['3432543'],
'time_dif':['2'],'price_unit':['23']})
what is the problem with encoding here?

XML Parsing Python ElementTree - Nested for loops

I'm using Jupyter Notebook and ElementTree (Python 3) to create a dataframe and save as csv from an XML file. Here is the XML format (in Estonian):
<asutused hetk="2020-04-14T03:53:33" ver="2">
<asutus>
<registrikood>10000515</registrikood>
<nimi>Osaühing B.Braun Medical</nimi>
<aadress />
<tegevusload>
<tegevusluba>
<tegevusloa_number>L04647</tegevusloa_number>
<alates>2019-12-10</alates>
<kuni />
<loaliik_kood>1</loaliik_kood>
<loaliik_nimi>Eriarstiabi</loaliik_nimi>
<haiglaliik_kood />
<haiglaliik_nimi />
<tegevuskohad>
<tegevuskoht>
<aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
<teenused>
<teenus>
<kood>T0038</kood>
<nimi>ambulatoorsed üldkirurgiateenused</nimi>
</teenus>
<teenus>
<kood>T0236</kood>
<nimi>õe vastuvõtuteenus</nimi>
</teenus>
</teenused>
</tegevuskoht>
<tegevuskoht>
<aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
<teenused>
<teenus>
<kood>T0038</kood>
<nimi>ambulatoorsed üldkirurgiateenused</nimi>
</teenus>
<teenus>
<kood>T0236</kood>
<nimi>õe vastuvõtuteenus</nimi>
</teenus>
</teenused>
</tegevuskoht>
</tegevuskohad>
</tegevusluba>
<tegevusluba>
<tegevusloa_number>L04651</tegevusloa_number>
<alates>2019-12-11</alates>
<kuni />
<loaliik_kood>2</loaliik_kood>
<loaliik_nimi>Õendusabi</loaliik_nimi>
<haiglaliik_kood />
<haiglaliik_nimi />
<tegevuskohad>
<tegevuskoht>
<aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
<teenused>
<teenus>
<kood>T0038</kood>
<nimi>ambulatoorsed üldkirurgiateenused</nimi>
</teenus>
<teenus>
<kood>T0236</kood>
<nimi>õe vastuvõtuteenus</nimi>
</teenus>
</teenused>
</tegevuskoht>
<tegevuskoht>
<aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
<teenused>
<teenus>
<kood>T0038</kood>
<nimi>ambulatoorsed üldkirurgiateenused</nimi>
</teenus>
<teenus>
<kood>T0236</kood>
<nimi>õe vastuvõtuteenus</nimi>
</teenus>
</teenused>
</tegevuskoht>
</tegevuskohad>
</tegevusluba>
</tegevusload>
<tootajad>
<tootaja>
<kood>D03091</kood>
<eesnimi>Evo</eesnimi>
<perenimi>Kaha</perenimi>
<kutse_kood>11</kutse_kood>
<kutse_nimi>Arst</kutse_nimi>
<erialad>
<eriala>
<kood>E420</kood>
<nimi>üldkirurgia</nimi>
</eriala>
</erialad>
</tootaja>
<tootaja>
<kood>N01146</kood>
<eesnimi>Karmen</eesnimi>
<perenimi>Mežulis</perenimi>
<kutse_kood>15</kutse_kood>
<kutse_nimi>Õde</kutse_nimi>
</tootaja>
<tootaja>
<kood>N01153</kood>
<eesnimi>Nele</eesnimi>
<perenimi>Terras</perenimi>
<kutse_kood>15</kutse_kood>
<kutse_nimi>Õde</kutse_nimi>
</tootaja>
<tootaja>
<kood>N02767</kood>
<eesnimi>Helena</eesnimi>
<perenimi>Tern</perenimi>
<kutse_kood>15</kutse_kood>
<kutse_nimi>Õde</kutse_nimi>
</tootaja>
<tootaja>
<kood>N12882</kood>
<eesnimi>Hanna</eesnimi>
<perenimi>Leemet</perenimi>
<kutse_kood>15</kutse_kood>
<kutse_nimi>Õde</kutse_nimi>
</tootaja>
</tootajad>
</asutus>
</asutused>
Each "asutus" is a hospital and I need some of the information inside. Here is my code:
tree = ET.parse("od_asutused.xml")
root = tree.getroot()
# open a file for writing
data = open('EE.csv', 'w')
# create the csv writer object
csvwriter = csv.writer(data, delimiter=';')
head = []
count = 0
for member in root.findall('asutus'):
hospital = []
if count == 0:
ident = member.find('registrikood').tag
head.append(id)
name = member.find('nimi').tag
head.append(name)
address = member.find('aadress').tag
head.append(address)
facility_type = member.find('./tegevusload/tegevusluba/haiglaliik_nimi').tag
head.append(facility_type)
site_address = member.find('./tegevusload/tegevusluba/tegevuskohad/tegevuskoht/aadress').tag
head.append(site_address)
for elem in member.findall('tegevusload'):
list_specs = elem.find('./tegevusluba/tegevuskohad/tegevuskoht/teenused/teenus/nimi').tag
head.append(list_specs)
csvwriter.writerow(head)
count = count + 1
ident = member.find('registrikood').text
hospital.append(ident)
name = member.find('nimi').text
hospital.append(name)
address = member.find('aadress').text
hospital.append(address)
facility_type = member.find('./tegevusload/tegevusluba/haiglaliik_nimi').text
hospital.append(facility_type)
site_address = member.find('./tegevusload/tegevusluba/tegevuskohad/tegevuskoht/aadress').text
hospital.append(site_address)
for spec in elem.findall('tegevusload'):
list_specs = spec.find('./tegevusluba/tegevuskohad/tegevuskoht/teenused/teenus/nimi').text
hospital.append(list_specs)
csvwriter.writerow(hospital)
data.close()
#Upload csv for geocoding
df = pd.read_csv(r'EE.csv', na_filter= False, delimiter=';')
#Rename columns
df.rename(columns = {'<built-in function id>':'id',
'nimi':'name',
'aadress':'address',
'haiglaliik_nimi':'facility_type',
'haiglaliik_kood':'facility_type_c',
'aadress.1':'site_address',
'nimi.1':'list_specs'},
inplace = True)
#Add columns
df['country'] = 'Estonia'
df['cc'] = 'EE'
df.head(10)
And the result of the df.head(10):
Result of dataframe
The "list_specs" is blank no matter what I do. How can I populate this field with a list of each 'nimi' for each site address? Thank you.
I found in your code the following points to change:
At least on my computer, calling csv.writer causes that newline chars
are doubled. The remedy I found is to open the output file with
additional parameters:
data = open('EE.csv', 'w', newline='\n', encoding='utf-8')
There is no sense to write head with Estonian column names and then
rename the columns. Note also that in head.append(id) you use an undeclared
variable (id).
But this is not so important, as I changed this whole section with writing
target column names (see below).
As you write the CSV file to be read by read_csv, it should contain a
fixed number of columns. So it is a bad practice to use a loop to write
one element.
Your instruction list_specs = elem.findall(...) was wrong, because
elem is not set in the current loop. Instead you should use member (but
I solved this detail other way).
There is no sense to create a variable only in order to use it once.
More concise and readable code is e.g. hospital.append(member.findtext('nimi')).
To avoid long XPath expressions, with repeated initial part, I decided
to set a temporary variable "in the middle" of this path, e.g.
tgvLb = member.find('tegevusload/tegevusluba') and then use a relative
XPath starting from this node.
Your rename instruction contains one not needed column, namely facility_type_c. You read only 6 columns, not 7.
So change the middle part of your code to:
data = open('EE.csv', 'w', newline='\n', encoding='utf-8')
csvwriter = csv.writer(data, delimiter=';')
head = ['id', 'name', 'address', 'facility_type', 'site_address', 'list_specs']
csvwriter.writerow(head)
for member in root.findall('asutus'):
hospital = []
hospital.append(member.findtext('registrikood'))
hospital.append(member.findtext('nimi'))
hospital.append(member.findtext('aadress'))
tgvLb = member.find('tegevusload/tegevusluba')
hospital.append(tgvLb.findtext('haiglaliik_nimi'))
tgvKoht = tgvLb.find('tegevuskohad/tegevuskoht')
hospital.append(tgvKoht.findtext('aadress'))
hospital.append(tgvKoht.findtext('teenused/teenus/nimi'))
csvwriter.writerow(hospital)
data.close()
df = pd.read_csv(r'EE.csv', na_filter= False, delimiter=';')
and drop df.rename from your code.

Python - How to count specific section in a list

I'm brand new to python and I'm struggling how to add certain sections of a cvs file in python. I'm not allowed to use "import cvs"
I'm importing the TipJoke CVS file from https://vincentarelbundock.github.io/Rdatasets/datasets.html
This is the only code I have so far that worked and I'm at a total loss on where to go from here.
if __name__ == '__main__':
from pprint import pprint
from string import punctuation
f = open("TipJoke.csv", "r")
tipList = []
for line in f:
#deletes the quotes
line = line.replace('"', '')
tipList.append(line)
pprint(tipList[])
Output:
[',Card,Tip,Ad,Joke,None\n',
'1,None,1,0,0,1\n',
'2,Joke,1,0,1,0\n',
'3,Ad,0,1,0,0\n',
'4,None,0,0,0,1\n',
'5,None,1,0,0,1\n',
'6,None,0,0,0,1\n',
'7,Ad,0,1,0,0\n',
'8,Ad,0,1,0,0\n',
'9,None,0,0,0,1\n',
'10,None,0,0,0,1\n',
'11,None,1,0,0,1\n',
'12,Ad,0,1,0,0\n',
'13,None,0,0,0,1\n',
'14,Ad,1,1,0,0\n',
'15,Joke,1,0,1,0\n',
'16,Joke,0,0,1,0\n',
'17,Joke,1,0,1,0\n',
'18,None,0,0,0,1\n',
'19,Joke,0,0,1,0\n',
'20,None,0,0,0,1\n',
'21,Ad,1,1,0,0\n',
'22,Ad,1,1,0,0\n',
'23,Ad,0,1,0,0\n',
'24,Joke,0,0,1,0\n',
'25,Joke,1,0,1,0\n',
'26,Joke,0,0,1,0\n',
'27,None,1,0,0,1\n',
'28,Joke,1,0,1,0\n',
'29,Joke,1,0,1,0\n',
'30,None,1,0,0,1\n',
'31,Joke,0,0,1,0\n',
'32,None,1,0,0,1\n',
'33,Joke,1,0,1,0\n',
'34,Ad,0,1,0,0\n',
'35,Joke,0,0,1,0\n',
'36,Ad,1,1,0,0\n',
'37,Joke,0,0,1,0\n',
'38,Ad,0,1,0,0\n',
'39,Joke,0,0,1,0\n',
'40,Joke,0,0,1,0\n',
'41,Joke,1,0,1,0\n',
'42,None,0,0,0,1\n',
'43,None,0,0,0,1\n',
'44,Ad,0,1,0,0\n',
'45,None,0,0,0,1\n',
'46,None,0,0,0,1\n',
'47,Ad,0,1,0,0\n',
'48,Joke,0,0,1,0\n',
'49,Joke,1,0,1,0\n',
'50,None,1,0,0,1\n',
'51,None,0,0,0,1\n',
'52,Joke,1,0,1,0\n',
'53,Joke,1,0,1,0\n',
'54,Joke,0,0,1,0\n',
'55,None,1,0,0,1\n',
'56,Ad,0,1,0,0\n',
'57,Joke,0,0,1,0\n',
'58,None,0,0,0,1\n',
'59,Ad,0,1,0,0\n',
'60,Joke,1,0,1,0\n',
'61,Ad,0,1,0,0\n',
'62,None,1,0,0,1\n',
'63,Joke,0,0,1,0\n',
'64,Ad,0,1,0,0\n',
'65,Joke,0,0,1,0\n',
'66,Ad,0,1,0,0\n',
'67,Ad,0,1,0,0\n',
'68,Ad,0,1,0,0\n',
'69,None,0,0,0,1\n',
'70,Joke,1,0,1,0\n',
'71,None,1,0,0,1\n',
'72,None,0,0,0,1\n',
'73,None,0,0,0,1\n',
'74,Joke,0,0,1,0\n',
'75,Ad,1,1,0,0\n',
'76,Ad,0,1,0,0\n',
'77,Ad,1,1,0,0\n',
'78,Joke,0,0,1,0\n',
'79,Joke,0,0,1,0\n',
'80,Ad,1,1,0,0\n',
'81,Ad,0,1,0,0\n',
'82,None,0,0,0,1\n',
'83,Ad,0,1,0,0\n',
'84,Joke,0,0,1,0\n',
'85,Joke,0,0,1,0\n',
'86,Ad,1,1,0,0\n',
'87,None,1,0,0,1\n',
'88,Joke,1,0,1,0\n',
'89,Ad,0,1,0,0\n',
'90,None,0,0,0,1\n',
'91,None,0,0,0,1\n',
'92,Joke,0,0,1,0\n',
'93,Joke,0,0,1,0\n',
'94,Ad,0,1,0,0\n',
'95,Ad,0,1,0,0\n',
'96,Ad,0,1,0,0\n',
'97,Joke,1,0,1,0\n',
'98,None,0,0,0,1\n',
'99,None,0,0,0,1\n',
'100,None,1,0,0,1\n',
'101,Joke,0,0,1,0\n',
'102,Joke,0,0,1,0\n',
'103,Ad,1,1,0,0\n',
'104,Ad,0,1,0,0\n',
'105,Ad,0,1,0,0\n',
'106,Ad,1,1,0,0\n',
'107,Ad,0,1,0,0\n',
'108,None,0,0,0,1\n',
'109,Ad,0,1,0,0\n',
'110,Joke,1,0,1,0\n',
'111,None,0,0,0,1\n',
'112,Ad,0,1,0,0\n',
'113,Ad,0,1,0,0\n',
'114,None,0,0,0,1\n',
'115,Ad,0,1,0,0\n',
'116,None,0,0,0,1\n',
'117,None,0,0,0,1\n',
'118,Ad,0,1,0,0\n',
'119,None,1,0,0,1\n',
'120,Ad,1,1,0,0\n',
'121,Ad,0,1,0,0\n',
'122,Ad,1,1,0,0\n',
'123,None,0,0,0,1\n',
'124,None,0,0,0,1\n',
'125,Joke,1,0,1,0\n',
'126,Joke,1,0,1,0\n',
'127,Ad,0,1,0,0\n',
'128,Joke,0,0,1,0\n',
'129,Joke,0,0,1,0\n',
'130,Ad,0,1,0,0\n',
'131,None,0,0,0,1\n',
'132,None,0,0,0,1\n',
'133,None,0,0,0,1\n',
'134,Joke,1,0,1,0\n',
'135,Ad,0,1,0,0\n',
'136,None,0,0,0,1\n',
'137,Joke,0,0,1,0\n',
'138,Ad,0,1,0,0\n',
'139,Ad,0,1,0,0\n',
'140,None,0,0,0,1\n',
'141,Joke,0,0,1,0\n',
'142,None,0,0,0,1\n',
'143,Ad,0,1,0,0\n',
'144,None,1,0,0,1\n',
'145,Joke,0,0,1,0\n',
'146,Ad,0,1,0,0\n',
'147,Ad,0,1,0,0\n',
'148,Ad,0,1,0,0\n',
'149,Joke,1,0,1,0\n',
'150,Ad,1,1,0,0\n',
'151,Joke,1,0,1,0\n',
'152,None,0,0,0,1\n',
'153,Ad,0,1,0,0\n',
'154,None,0,0,0,1\n',
'155,None,0,0,0,1\n',
'156,Ad,0,1,0,0\n',
'157,Ad,0,1,0,0\n',
'158,Joke,0,0,1,0\n',
'159,None,0,0,0,1\n',
'160,Joke,1,0,1,0\n',
'161,None,1,0,0,1\n',
'162,Ad,1,1,0,0\n',
'163,Joke,0,0,1,0\n',
'164,Joke,0,0,1,0\n',
'165,Ad,0,1,0,0\n',
'166,Joke,1,0,1,0\n',
'167,Joke,1,0,1,0\n',
'168,Ad,0,1,0,0\n',
'169,Joke,1,0,1,0\n',
'170,Joke,0,0,1,0\n',
'171,Ad,0,1,0,0\n',
'172,Joke,0,0,1,0\n',
'173,Joke,0,0,1,0\n',
'174,Ad,0,1,0,0\n',
'175,None,0,0,0,1\n',
'176,Joke,1,0,1,0\n',
'177,Ad,0,1,0,0\n',
'178,Joke,0,0,1,0\n',
'179,Joke,0,0,1,0\n',
'180,None,0,0,0,1\n',
'181,None,0,0,0,1\n',
'182,Ad,0,1,0,0\n',
'183,None,0,0,0,1\n',
'184,None,0,0,0,1\n',
'185,None,0,0,0,1\n',
'186,None,0,0,0,1\n',
'187,Ad,0,1,0,0\n',
'188,None,1,0,0,1\n',
'189,Ad,0,1,0,0\n',
'190,Ad,0,1,0,0\n',
'191,Ad,0,1,0,0\n',
'192,Joke,1,0,1,0\n',
'193,Joke,0,0,1,0\n',
'194,Ad,0,1,0,0\n',
'195,None,0,0,0,1\n',
'196,Joke,1,0,1,0\n',
'197,Joke,0,0,1,0\n',
'198,Joke,1,0,1,0\n',
'199,Ad,0,1,0,0\n',
'200,None,0,0,0,1\n',
'201,Joke,1,0,1,0\n',
'202,Joke,0,0,1,0\n',
'203,Joke,0,0,1,0\n',
'204,Ad,0,1,0,0\n',
'205,None,0,0,0,1\n',
'206,Ad,0,1,0,0\n',
'207,Ad,0,1,0,0\n',
'208,Joke,0,0,1,0\n',
'209,Ad,0,1,0,0\n',
'210,Joke,0,0,1,0\n',
'211,None,0,0,0,1\n']
I'm currently trying to find the Total number of entries of the specified card type and the Percentage of tips given for the specified card type with two decimal places of precision. The tip column is the 0 or 1 right after the card type (None, Ad, Joke).
if you are allowed with pandas library then
import pandas as pd
df = pd.read_csv("TipJoke.csv")
df is a pandas dataframe object in which you can perform multiple filtering task according to your need.
for example if you want to get data for Joke you can filter like this:
print(df[df["Card"] == "Joke"])
Though, i'm just providing you the direction , not whole logic for your question.
This works
from pprint import pprint
from string import punctuation
counts = {"Joke": 0, "Ad": 0, "None": 0}
with open("TipJoke.csv", "r") as f:
for line in f:
line_clean = line.replace('"', "").replace("\n", "").split(",")
try:
counts[line_clean[1]] += int(line_clean[2])
except:
pass
print(counts)

How to load a CSV with nested arrays

I came across a dataset of Twitter users (Kaggle Source) but I have found that the dataset has a rather strange format. It contains a row with column headers, and then rows of what are essentially JSON arrays. The dataset is also quite large which makes it very difficult to convert the entire file into JSON objects.
What is a good way to load this data into Python, preferably a Pandas Dataframe?
Example of Data
id,screenName,tags,avatar,followersCount,friendsCount,lang,lastSeen,tweetId,friends
"1969527638","LlngoMakeEmCum_",[ "#nationaldogday" ],"http://pbs.twimg.com/profile_images/534286217882652672/FNmiQYVO_normal.jpeg",319,112,"en",1472271687519,"769310701580083200",[ "1969574754", "1969295556", "1969284056", "1969612214", "1970067476", "1969797386", "1969430539", "1969840064", "1969698176", "1970005154", "283011644", "1969901029", "1969563175", "1969302314", "1969978662", "1969457936", "1969667533", "1969547821", "1969943478", "1969668032", "283006529", "1969809440", "1969601096", "1969298856", "1969331652", "1969385498", "1969674368", "1969565263", "1970144676", "1969745390", "1969947438", "1969734134", "1969801326", "1969324008", "1969259820", "1969535827", "1970072989", "1969771688", "1969437804", "1969507394", "1969509972", "1969751588", "283012808", "1969302888", "1970224440", "1969603532", "283011244", "1969501046", "1969887518", "1970153138", "1970267527", "1969941955", "1969421654", "1970013110", "1969544905", "1969839590", "1969876500", "1969674625", "1969337952", "1970046536", "1970090934", "1969419133", "1969517215", "1969787869", "1969298065", "1970149771", "1969422638", "1969504268", "1970025554", "1969776001", "1970138611", "1969316186", "1969547558", "1969689272", "283009727", "283015491", "1969526874", "1969662210", "1969536164", "1969320008", "1969893793", "1970158393", "1969365936", "1970194418", "1969942094", "1969631580", "1969704756", "1969920092", "1969712882", "1969791680", "1969408164", "1969754851", "1970205480", "1969840267", "1969443211", "1969706762", "1969692698", "1969751576", "1969486796", "1969286630", "1969686674", "1969833492", "1969294814", "1969472719", "1969685018", "283008559", "283011243", "1969680078", "1969545697", "1969646412", "1969442725", "1969692529" ]
"51878493","_notmichelle",[ "#nationaldogday" ],"http://pbs.twimg.com/profile_images/761977602173046786/4_utEHsD_normal.jpg",275,115,"en",1472270622663,"769309490038439936",[ "60789485", "2420931980", "2899776756", "127410795", "38747286", "1345516880", "236076395", "1242946609", "2567887488", "280777286", "2912446303", "1149916171", "3192577639", "239569380", "229974168", "389097282", "266336410", "1850301204", "2364414805", "812302213", "2318240348", "158634793", "542282350", "569664772", "766573472", "703551325", "168564432", "261054460", "402980453", "562547390", "539630318", "165167145", "22216387", "427568285", "61033129", "213519434", "373092437", "170762012", "273601960", "322108757", "1681816280", "357843027", "737471496", "406541143", "1084122632", "633477616", "537821327", "793079732", "2386380799", "479015607", "783354019", "365171478", "625002575", "2326207404", "1653286842", "1676964216", "2296617326", "1583692190", "1315393903", "377660026", "2235123476", "792779641", "351222527", "444993309", "588396446", "377629159", "469383424", "1726612471", "415230430", "942443390", "360924168", "318593248", "565022085", "319679735", "632508305", "377638254", "1392782078", "584483723", "377703135", "180463340", "564978577", "502517645", "1056960042", "285097108", "410245879", "159121042", "570399371", "502348447", "960927356", "377196638", "478142245", "335043809", "73546116", "11348282", "901302409", "53255593", "515983155", "391774800", "62351523", "724792351", "346296289", "152520627", "559053427", "508019115", "349996133", "378859519", "65120103", "190070557", "339868374", "417355200", "256729771", "16171898", "45266183", "16143507", "165258639" ]
We could start with something like this:
(Might need to rethink the use of | though. We could go for something more exotic like ╡
import pandas as pd
import io
import json
data = '''\
id,screenName,tags,avatar,followersCount,friendsCount,lang,lastSeen,tweetId,friends
"1969527638","LlngoMakeEmCum_",[ "#nationaldogday" ],"http://pbs.twimg.com/profile_images/534286217882652672/FNmiQYVO_normal.jpeg",319,112,"en",1472271687519,"769310701580083200",[ "1969574754", "1969295556", "1969284056", "1969612214", "1970067476", "1969797386", "1969430539", "1969840064", "1969698176", "1970005154", "283011644", "1969901029", "1969563175", "1969302314", "1969978662", "1969457936", "1969667533", "1969547821", "1969943478", "1969668032", "283006529", "1969809440", "1969601096", "1969298856", "1969331652", "1969385498", "1969674368", "1969565263", "1970144676", "1969745390", "1969947438", "1969734134", "1969801326", "1969324008", "1969259820", "1969535827", "1970072989", "1969771688", "1969437804", "1969507394", "1969509972", "1969751588", "283012808", "1969302888", "1970224440", "1969603532", "283011244", "1969501046", "1969887518", "1970153138", "1970267527", "1969941955", "1969421654", "1970013110", "1969544905", "1969839590", "1969876500", "1969674625", "1969337952", "1970046536", "1970090934", "1969419133", "1969517215", "1969787869", "1969298065", "1970149771", "1969422638", "1969504268", "1970025554", "1969776001", "1970138611", "1969316186", "1969547558", "1969689272", "283009727", "283015491", "1969526874", "1969662210", "1969536164", "1969320008", "1969893793", "1970158393", "1969365936", "1970194418", "1969942094", "1969631580", "1969704756", "1969920092", "1969712882", "1969791680", "1969408164", "1969754851", "1970205480", "1969840267", "1969443211", "1969706762", "1969692698", "1969751576", "1969486796", "1969286630", "1969686674", "1969833492", "1969294814", "1969472719", "1969685018", "283008559", "283011243", "1969680078", "1969545697", "1969646412", "1969442725", "1969692529" ]
"51878493","_notmichelle",[ "#nationaldogday" ],"http://pbs.twimg.com/profile_images/761977602173046786/4_utEHsD_normal.jpg",275,115,"en",1472270622663,"769309490038439936",[ "60789485", "2420931980", "2899776756", "127410795", "38747286", "1345516880", "236076395", "1242946609", "2567887488", "280777286", "2912446303", "1149916171", "3192577639", "239569380", "229974168", "389097282", "266336410", "1850301204", "2364414805", "812302213", "2318240348", "158634793", "542282350", "569664772", "766573472", "703551325", "168564432", "261054460", "402980453", "562547390", "539630318", "165167145", "22216387", "427568285", "61033129", "213519434", "373092437", "170762012", "273601960", "322108757", "1681816280", "357843027", "737471496", "406541143", "1084122632", "633477616", "537821327", "793079732", "2386380799", "479015607", "783354019", "365171478", "625002575", "2326207404", "1653286842", "1676964216", "2296617326", "1583692190", "1315393903", "377660026", "2235123476", "792779641", "351222527", "444993309", "588396446", "377629159", "469383424", "1726612471", "415230430", "942443390", "360924168", "318593248", "565022085", "319679735", "632508305", "377638254", "1392782078", "584483723", "377703135", "180463340", "564978577", "502517645", "1056960042", "285097108", "410245879", "159121042", "570399371", "502348447", "960927356", "377196638", "478142245", "335043809", "73546116", "11348282", "901302409", "53255593", "515983155", "391774800", "62351523", "724792351", "346296289", "152520627", "559053427", "508019115", "349996133", "378859519", "65120103", "190070557", "339868374", "417355200", "256729771", "16171898", "45266183", "16143507", "165258639" ]'''
# Create new separator (|) after 9th comma (',')
data = '\n'.join(['|'.join(row.split(',',9)) for row in data.split('\n')])
# REPLACE WITH THIS FOR REAL FILE
#with open('path/to/file') as f:
#data = '\n'.join(['|'.join(row.split(',',9)) for row in f.read().split('\n')])
# Read dataframe
df = pd.read_csv(io.StringIO(data), sep='|')
# Convert strings to objects with json module:
df['friends'] = df['friends'].apply(lambda x: json.loads(x))
df['tags'] = df['tags'].apply(lambda x: json.loads(x))
Safer approach:
import pandas as pd
import io
import json
with open('path/to/file') as f:
columns, *rows = [row.split(',',9) for row in f.read().split('\n')]
df = pd.DataFrame(rows, columns=columns)
# Convert strings to objects with json module:
df['friends'] = df['friends'].apply(lambda x: json.loads(x))
df['tags'] = df['tags'].apply(lambda x: json.loads(x))

Categories