File name grouping and indexing - python
I have a folder of txt files like those below, and I have used os.listdir to generate a file list:
['acc_exp01_user01.txt', 'acc_exp02_user01.txt', 'acc_exp03_user02.txt', 'acc_exp04_user02.txt', 'acc_exp05_user03.txt', 'acc_exp06_user03.txt', 'acc_exp07_user04.txt', 'acc_exp08_user04.txt', 'acc_exp09_user05.txt', 'acc_exp10_user05.txt', 'acc_exp11_user06.txt', 'acc_exp12_user06.txt', 'acc_exp13_user07.txt', 'acc_exp14_user07.txt', 'acc_exp15_user08.txt', 'acc_exp16_user08.txt', 'acc_exp17_user09.txt', 'acc_exp18_user09.txt', 'acc_exp19_user10.txt', 'acc_exp20_user10.txt', 'acc_exp21_user10.txt', 'acc_exp22_user11.txt', 'acc_exp23_user11.txt', 'acc_exp24_user12.txt', 'acc_exp25_user12.txt', 'acc_exp26_user13.txt', 'acc_exp27_user13.txt', 'acc_exp28_user14.txt', 'acc_exp29_user14.txt', 'acc_exp30_user15.txt', 'acc_exp31_user15.txt', 'acc_exp32_user16.txt', 'acc_exp33_user16.txt', 'acc_exp34_user17.txt', 'acc_exp35_user17.txt', 'acc_exp36_user18.txt', 'acc_exp37_user18.txt', 'acc_exp38_user19.txt', 'acc_exp39_user19.txt', 'acc_exp40_user20.txt', 'acc_exp41_user20.txt', 'acc_exp42_user21.txt', 'acc_exp43_user21.txt', 'acc_exp44_user22.txt', 'acc_exp45_user22.txt', 'acc_exp46_user23.txt', 'acc_exp47_user23.txt', 'acc_exp48_user24.txt', 'acc_exp49_user24.txt', 'acc_exp50_user25.txt', 'acc_exp51_user25.txt', 'acc_exp52_user26.txt', 'acc_exp53_user26.txt', 'acc_exp54_user27.txt', 'acc_exp55_user27.txt', 'acc_exp56_user28.txt', 'acc_exp57_user28.txt', 'acc_exp58_user29.txt', 'acc_exp59_user29.txt', 'acc_exp60_user30.txt', 'acc_exp61_user30.txt', 'gyro_exp01_user01.txt', 'gyro_exp02_user01.txt', 'gyro_exp03_user02.txt', 'gyro_exp04_user02.txt', 'gyro_exp05_user03.txt', 'gyro_exp06_user03.txt', 'gyro_exp07_user04.txt', 'gyro_exp08_user04.txt', 'gyro_exp09_user05.txt', 'gyro_exp10_user05.txt', 'gyro_exp11_user06.txt', 'gyro_exp12_user06.txt', 'gyro_exp13_user07.txt', 'gyro_exp14_user07.txt', 'gyro_exp15_user08.txt', 'gyro_exp16_user08.txt', 'gyro_exp17_user09.txt', 'gyro_exp18_user09.txt', 'gyro_exp19_user10.txt', 'gyro_exp20_user10.txt', 'gyro_exp21_user10.txt', 'gyro_exp22_user11.txt', 'gyro_exp23_user11.txt', 'gyro_exp24_user12.txt', 'gyro_exp25_user12.txt', 'gyro_exp26_user13.txt', 'gyro_exp27_user13.txt', 'gyro_exp28_user14.txt', 'gyro_exp29_user14.txt', 'gyro_exp30_user15.txt', 'gyro_exp31_user15.txt', 'gyro_exp32_user16.txt', 'gyro_exp33_user16.txt', 'gyro_exp34_user17.txt', 'gyro_exp35_user17.txt', 'gyro_exp36_user18.txt', 'gyro_exp37_user18.txt', 'gyro_exp38_user19.txt', 'gyro_exp39_user19.txt', 'gyro_exp40_user20.txt', 'gyro_exp41_user20.txt', 'gyro_exp42_user21.txt', 'gyro_exp43_user21.txt', 'gyro_exp44_user22.txt', 'gyro_exp45_user22.txt', 'gyro_exp46_user23.txt', 'gyro_exp47_user23.txt', 'gyro_exp48_user24.txt', 'gyro_exp49_user24.txt', 'gyro_exp50_user25.txt', 'gyro_exp51_user25.txt', 'gyro_exp52_user26.txt', 'gyro_exp53_user26.txt', 'gyro_exp54_user27.txt', 'gyro_exp55_user27.txt', 'gyro_exp56_user28.txt', 'gyro_exp57_user28.txt', 'gyro_exp58_user29.txt', 'gyro_exp59_user29.txt', 'gyro_exp60_user30.txt', 'gyro_exp61_user30.txt', 'labels.txt']
but now I want to group them into an indexing list like this (pairing each acc file with its gyro file per experiment and user).
How can I achieve this?
You can use glob to find the files matching a pattern in a given path, then build the required DataFrame:
from glob import glob
import os

import pandas as pd

exp_path = "Your Path Here"

# The files in the listing are .txt, so match that extension
acc_pattern = "acc_exp*.txt"
gyro_pattern = "gyro_exp*.txt"

# Sort the results so the acc and gyro files line up by experiment number
# (glob returns files in arbitrary order)
acc_files = sorted(glob(os.path.join(exp_path, acc_pattern)))
gyro_files = sorted(glob(os.path.join(exp_path, gyro_pattern)))
Once you have all the required files, you can create the DataFrame:
df = pd.DataFrame()
df['acc'] = [os.path.basename(x) for x in acc_files]
df['gyro'] = [os.path.basename(x) for x in gyro_files]

# The names are fixed-width ("acc_expNN_userNN.txt"),
# so the two-digit IDs can be sliced out by position
df['experiment'] = df['acc'].apply(lambda x: x[7:9])
df['userId'] = df['acc'].apply(lambda x: x[14:16])
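If the numbering ever grows past two digits, position-based slicing breaks. A hedged alternative is to let a regular expression pull the IDs out; the pattern below is an assumption that simply encodes the naming scheme visible in the listing above:

import re

# Assumed naming scheme: "acc_expNN_userNN.txt"
name_re = re.compile(r'acc_exp(\d+)_user(\d+)\.txt')

# str.extract returns one column per capture group
df[['experiment', 'userId']] = df['acc'].str.extract(name_re)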
Related
Superfluous quotes appearing in my CSV using Python code
I wrote some code to generate multiple addresses and export them to CSV:

import csv
import ipaddress
import random
from random import shuffle

LAN = ipaddress.ip_network('192.168.0.0/16')
WAN1 = ipaddress.ip_network('11.10.8.0/22')
WAN2 = ipaddress.ip_network('12.10.8.0/22')

LAN_IP_Adresses = [IP_LAN for IP_LAN in LAN.hosts()]
WAN1_IP_Adresses = [IP_WAN1 for IP_WAN1 in WAN1.hosts()]
WAN2_IP_Adresses = [IP_WAN2 for IP_WAN2 in WAN2.hosts()]

index_IP_GW = len(WAN1_IP_Adresses) - 1

locations_list = ['Abidjan', 'Abu Dhabi', 'Adana', 'Adelaide', 'Ahmadabad', 'Algiers', 'Amsterdam', 'Ankara',
                  'Anshan', 'Athens', 'BANGKOK', 'BUCHAREST', 'BUDAPEST', 'Bagdad', 'Bandung', 'Bangalore',
                  'Bangkok', 'Barcelona', 'Beirut', 'Belgrade', 'Bern', 'Bogota', 'Brasilia', 'Brazzaville',
                  'Brussels', 'Bursa', 'CAIRO', 'CARACAS', 'CONAKRY', 'Canberra', 'Casablanca', 'Changchun',
                  'Chengdu', 'Chicago', 'Copenhagen', 'Dakar', 'MINSK', 'Madrid', 'Medina', 'Nairobi', 'Napoli',
                  'Montreal', 'Odessa', 'Omdurman', 'Osaka', 'Ottawa', 'PYONGYANG', 'Paris', 'Pekin', 'Perth',
                  'Philadelphia', 'Phoenix', 'Pimpri Chinchwad', 'Porto', 'Porto Alegre', 'QUITO', 'Qingdao',
                  'Rabat', 'Rajkot', 'Riadh', 'Rio de Janeiro', 'Rome', 'SANTIAGO', 'Salvador', 'Samara',
                  'San Antonio', 'San Francisco', 'Sao Paulo', 'Sendai', 'Seongnam', 'Seoul', 'Shanghai',
                  'Singapore', 'Sydney', 'Taiyuan', 'Tehran', 'Tijuana', 'Tokyo', 'Toronto', 'Moscou', 'Moscow',
                  'Mumbai (Bombay)', 'Munich', 'México', 'Milan', 'Tripoli', 'Tunis', 'Vienna', 'Warsaw', 'Wuhan',
                  'Xian', 'Yaounde', 'Yokohama', 'Zapopan', 'hong kong', 'Dallas', 'Delhi', 'Doha', 'Dublin',
                  'Durban', 'Ecatepec', 'Frankfurt', 'Fukuoka', 'Giza', 'Hamburg', 'Havana', 'Helsinki', 'Houston',
                  'Hyderabad', 'Istanbul', 'Jaipur', 'Jakarta', 'Jeddah', 'Johannesburg', 'KIEV', 'Kaduna', 'Kano',
                  'Kazan', 'Kuala Lumpur', 'Kyoto', 'LUANDA', 'Lahore', 'Lanzhou', 'Le Caire', 'Leon', 'Lima',
                  'Lisbon', 'London', 'Los Angeles', 'Lyon', 'MANILA', 'Melbourne', 'New York']

#Site_Nmb=1
def initial_Sites_list_generator(filename='SITES_CI.csv', Number_of_Sites=1000):
    file_to_output = open(filename, 'w', newline='')
    csv_writer = csv.writer(file_to_output, delimiter=',')
    Site_Nbr = 1
    index = 0
    csv_writer.writerow(["SITE_NAME", "SERIAL_NUMBER", '"LAN_IP_ADDRESS"', '"WAN_IP_ADDRESS1"',
                         '"WAN_IP_ADDRESS2"', '"GATEWAY_IP_ADDRESS1"', '"GATEWAY_IP_ADDRESS2"',
                         '"ROUTE_REFLECTOR"', '"LOCATIONS"', '"HARDWAREMODEL"', '"LANINT"',
                         '"WANINT1"', '"WANINT2"', '"BW_OUT"', '"BW_IN"'])
    for i in range(1, Number_of_Sites + 1):
        shuffle(locations_list)
        location = random.choice(locations_list)
        csv_writer.writerow(['"SITE' + str(Site_Nbr) + '"', "2e70129bde9c4426b9213d4408c300",
                             f'"{(LAN_IP_Adresses[index])}"', f'"{str(WAN1_IP_Adresses[index])}"',
                             f'"{str(WAN2_IP_Adresses[index])}"', f'"{str(WAN1_IP_Adresses[index_IP_GW])}"',
                             f'"{str(WAN2_IP_Adresses[index_IP_GW])}"', '"False"', f'"{location}"',
                             '"ONEv600"', '"gigabitethernet0/2"', '"gigabitethernet0/0"',
                             '"gigabitethernet0/1"', '"10"', '"20"'])
        Site_Nbr = Site_Nbr + 1
        index = index + 1
    file_to_output.close()

initial_Sites_list_generator('SITES_OVP.csv', 1000)

but I got unnecessary quotes added in my CSV.
You are adding the extra quotes yourself. In your for loop, change this line:

csv_writer.writerow(['"SITE' + str(Site_Nbr) + '"', "2e70129bde9c4426b9213d4408c300",
                     f'"{(LAN_IP_Adresses[index])}"', f'"{str(WAN1_IP_Adresses[index])}"',
                     f'"{str(WAN2_IP_Adresses[index])}"', f'"{str(WAN1_IP_Adresses[index_IP_GW])}"',
                     f'"{str(WAN2_IP_Adresses[index_IP_GW])}"', '"False"', f'"{location}"',
                     '"ONEv600"', '"gigabitethernet0/2"', '"gigabitethernet0/0"',
                     '"gigabitethernet0/1"', '"10"', '"20"'])

to this:

csv_writer.writerow(['SITE' + str(Site_Nbr), "2e70129bde9c4426b9213d4408c300",
                     f'{(LAN_IP_Adresses[index])}', f'{str(WAN1_IP_Adresses[index])}',
                     f'{str(WAN2_IP_Adresses[index])}', f'{str(WAN1_IP_Adresses[index_IP_GW])}',
                     f'{str(WAN2_IP_Adresses[index_IP_GW])}', 'False', f'{location}',
                     'ONEv600', 'gigabitethernet0/2', 'gigabitethernet0/0',
                     'gigabitethernet0/1', '10', '20'])

The CSV writer already adds quotes to strings as appropriate.
I did csv_writer = csv.writer(file_to_output, delimiter=",", quoting=csv.QUOTE_ALL) and it worked!
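For anyone comparing the two behaviours, here is a minimal sketch of default quoting versus QUOTE_ALL (the field values are made up for the demonstration):

import csv
import sys

row = ['SITE1', '192.168.0.1', 'False', '10']

# Default QUOTE_MINIMAL: quotes only where required
csv.writer(sys.stdout).writerow(row)                          # SITE1,192.168.0.1,False,10

# QUOTE_ALL: the writer itself wraps every field in quotes
csv.writer(sys.stdout, quoting=csv.QUOTE_ALL).writerow(row)   # "SITE1","192.168.0.1","False","10"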
Encoding Lithuanian characters in XML using Python
I have this code:

import xml.etree.ElementTree as ET
from xml.dom import minidom
import pandas as pd

def convert_df_to_xml(df, fd, ld):
    # create the root element, named Invoices
    root = ET.Element("Invoices")
    root.set("from", str(fd))
    root.set("till", str(ld))
    for i in range(len(df['partner_id'])):
        # add a sub-element
        invoices = ET.SubElement(root, "Invoice")
        invoices.set('clientid', df['company_registry'][i])
        invoices.set('imones_pavadinimas', df['partner_id'][i])
        # add a sub-sub-element
        quebec = ET.SubElement(invoices, "Product")
        # load the row info from the dataframe
        sectin_1 = ET.SubElement(quebec, "Name")
        sectin_1.text = str(df["Name"][i])
        sectin_2 = ET.SubElement(quebec, 'Quantity')
        sectin_2.text = str(df["time_dif"][i])
        sectin_3 = ET.SubElement(quebec, 'Price')
        sectin_3.text = str(df["price_unit"][i])
    xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ", encoding="UTF-8").decode("UTF-8")
    with open("bandomasis_itp_xml_failas_V_1.1.xml", "w") as f:
        f.write(xmlstr)

I'm creating an XML file from a Python DataFrame. The problem is that in the XML file I get "?" marks instead of the "ė" character. In the dataframe I have strings with the characters "ė, ą, š, ų" and I need them to be in the XML file. My dataframe:

df1 = pd.DataFrame({'partner_id': ['MED GRUPĖ, UAB'], 'Name': ['Pirmas'],
                    'company_registry': ['3432543'],
                    'time_dif': ['2'], 'price_unit': ['23']})

What is the problem with the encoding here?
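A likely cause, worth checking: toprettyxml(..., encoding="UTF-8") already returns UTF-8 bytes, but after .decode("UTF-8") the text is handed to open() with no encoding argument, so it is re-encoded with the platform's default encoding (often not UTF-8 on Windows), and characters like "ė" get mangled. A minimal sketch of the usual fix, keeping everything explicitly UTF-8:

from xml.dom import minidom
import xml.etree.ElementTree as ET

# ... build `root` as in the question ...

# Keep the pretty-printed output as UTF-8 bytes and write in binary mode,
# so no locale-dependent re-encoding can happen
xml_bytes = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ", encoding="UTF-8")
with open("bandomasis_itp_xml_failas_V_1.1.xml", "wb") as f:
    f.write(xml_bytes)

# Equivalent alternative: decode, but pass the encoding explicitly to open()
# with open("bandomasis_itp_xml_failas_V_1.1.xml", "w", encoding="utf-8") as f:
#     f.write(xml_bytes.decode("utf-8"))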
XML Parsing Python ElementTree - Nested for loops
I'm using Jupyter Notebook and ElementTree (Python 3) to create a dataframe from an XML file and save it as CSV. Here is the XML format (in Estonian):

<asutused hetk="2020-04-14T03:53:33" ver="2">
  <asutus>
    <registrikood>10000515</registrikood>
    <nimi>Osaühing B.Braun Medical</nimi>
    <aadress />
    <tegevusload>
      <tegevusluba>
        <tegevusloa_number>L04647</tegevusloa_number>
        <alates>2019-12-10</alates>
        <kuni />
        <loaliik_kood>1</loaliik_kood>
        <loaliik_nimi>Eriarstiabi</loaliik_nimi>
        <haiglaliik_kood />
        <haiglaliik_nimi />
        <tegevuskohad>
          <tegevuskoht>
            <aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
            <teenused>
              <teenus>
                <kood>T0038</kood>
                <nimi>ambulatoorsed üldkirurgiateenused</nimi>
              </teenus>
              <teenus>
                <kood>T0236</kood>
                <nimi>õe vastuvõtuteenus</nimi>
              </teenus>
            </teenused>
          </tegevuskoht>
          <tegevuskoht>
            <aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
            <teenused>
              <teenus>
                <kood>T0038</kood>
                <nimi>ambulatoorsed üldkirurgiateenused</nimi>
              </teenus>
              <teenus>
                <kood>T0236</kood>
                <nimi>õe vastuvõtuteenus</nimi>
              </teenus>
            </teenused>
          </tegevuskoht>
        </tegevuskohad>
      </tegevusluba>
      <tegevusluba>
        <tegevusloa_number>L04651</tegevusloa_number>
        <alates>2019-12-11</alates>
        <kuni />
        <loaliik_kood>2</loaliik_kood>
        <loaliik_nimi>Õendusabi</loaliik_nimi>
        <haiglaliik_kood />
        <haiglaliik_nimi />
        <tegevuskohad>
          <tegevuskoht>
            <aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
            <teenused>
              <teenus>
                <kood>T0038</kood>
                <nimi>ambulatoorsed üldkirurgiateenused</nimi>
              </teenus>
              <teenus>
                <kood>T0236</kood>
                <nimi>õe vastuvõtuteenus</nimi>
              </teenus>
            </teenused>
          </tegevuskoht>
          <tegevuskoht>
            <aadress>Harju maakond, Tallinn, Mustamäe linnaosa, J. Sütiste tee 17/1</aadress>
            <teenused>
              <teenus>
                <kood>T0038</kood>
                <nimi>ambulatoorsed üldkirurgiateenused</nimi>
              </teenus>
              <teenus>
                <kood>T0236</kood>
                <nimi>õe vastuvõtuteenus</nimi>
              </teenus>
            </teenused>
          </tegevuskoht>
        </tegevuskohad>
      </tegevusluba>
    </tegevusload>
    <tootajad>
      <tootaja>
        <kood>D03091</kood>
        <eesnimi>Evo</eesnimi>
        <perenimi>Kaha</perenimi>
        <kutse_kood>11</kutse_kood>
        <kutse_nimi>Arst</kutse_nimi>
        <erialad>
          <eriala>
            <kood>E420</kood>
            <nimi>üldkirurgia</nimi>
          </eriala>
        </erialad>
      </tootaja>
      <tootaja>
        <kood>N01146</kood>
        <eesnimi>Karmen</eesnimi>
        <perenimi>Mežulis</perenimi>
        <kutse_kood>15</kutse_kood>
        <kutse_nimi>Õde</kutse_nimi>
      </tootaja>
      <tootaja>
        <kood>N01153</kood>
        <eesnimi>Nele</eesnimi>
        <perenimi>Terras</perenimi>
        <kutse_kood>15</kutse_kood>
        <kutse_nimi>Õde</kutse_nimi>
      </tootaja>
      <tootaja>
        <kood>N02767</kood>
        <eesnimi>Helena</eesnimi>
        <perenimi>Tern</perenimi>
        <kutse_kood>15</kutse_kood>
        <kutse_nimi>Õde</kutse_nimi>
      </tootaja>
      <tootaja>
        <kood>N12882</kood>
        <eesnimi>Hanna</eesnimi>
        <perenimi>Leemet</perenimi>
        <kutse_kood>15</kutse_kood>
        <kutse_nimi>Õde</kutse_nimi>
      </tootaja>
    </tootajad>
  </asutus>
</asutused>

Each "asutus" is a hospital, and I need some of the information inside.
Here is my code:

import csv
import xml.etree.ElementTree as ET
import pandas as pd

tree = ET.parse("od_asutused.xml")
root = tree.getroot()

# open a file for writing
data = open('EE.csv', 'w')

# create the csv writer object
csvwriter = csv.writer(data, delimiter=';')
head = []
count = 0

for member in root.findall('asutus'):
    hospital = []
    if count == 0:
        ident = member.find('registrikood').tag
        head.append(id)
        name = member.find('nimi').tag
        head.append(name)
        address = member.find('aadress').tag
        head.append(address)
        facility_type = member.find('./tegevusload/tegevusluba/haiglaliik_nimi').tag
        head.append(facility_type)
        site_address = member.find('./tegevusload/tegevusluba/tegevuskohad/tegevuskoht/aadress').tag
        head.append(site_address)
        for elem in member.findall('tegevusload'):
            list_specs = elem.find('./tegevusluba/tegevuskohad/tegevuskoht/teenused/teenus/nimi').tag
            head.append(list_specs)
        csvwriter.writerow(head)
        count = count + 1
    ident = member.find('registrikood').text
    hospital.append(ident)
    name = member.find('nimi').text
    hospital.append(name)
    address = member.find('aadress').text
    hospital.append(address)
    facility_type = member.find('./tegevusload/tegevusluba/haiglaliik_nimi').text
    hospital.append(facility_type)
    site_address = member.find('./tegevusload/tegevusluba/tegevuskohad/tegevuskoht/aadress').text
    hospital.append(site_address)
    for spec in elem.findall('tegevusload'):
        list_specs = spec.find('./tegevusluba/tegevuskohad/tegevuskoht/teenused/teenus/nimi').text
        hospital.append(list_specs)
    csvwriter.writerow(hospital)

data.close()

# Upload csv for geocoding
df = pd.read_csv(r'EE.csv', na_filter=False, delimiter=';')

# Rename columns
df.rename(columns={'<built-in function id>': 'id', 'nimi': 'name', 'aadress': 'address',
                   'haiglaliik_nimi': 'facility_type', 'haiglaliik_kood': 'facility_type_c',
                   'aadress.1': 'site_address', 'nimi.1': 'list_specs'}, inplace=True)

# Add columns
df['country'] = 'Estonia'
df['cc'] = 'EE'
df.head(10)

And the result of df.head(10): (screenshot of the resulting dataframe, not reproduced here)

The "list_specs" column is blank no matter what I do. How can I populate this field with a list of each 'nimi' for each site address? Thank you.
I found the following points to change in your code:

1. At least on my computer, calling csv.writer causes newline characters to be doubled. The remedy I found is to open the output file with additional parameters: data = open('EE.csv', 'w', newline='\n', encoding='utf-8')
2. There is no point in writing the header with Estonian column names and then renaming the columns. Note also that in head.append(id) you use an undeclared variable (id). But this is not so important, as I replaced this whole section with writing the target column names directly (see below).
3. Since you write the CSV file to be read by read_csv, it should contain a fixed number of columns, so it is bad practice to use a loop to write a single element.
4. Your instruction list_specs = elem.findall(...) was wrong, because elem is not set in the current loop. You should use member instead (but I solved this detail another way).
5. There is no point in creating a variable only to use it once. More concise and readable is e.g. hospital.append(member.findtext('nimi')).
6. To avoid long XPath expressions with a repeated initial part, I set a temporary variable "in the middle" of this path, e.g. tgvLb = member.find('tegevusload/tegevusluba'), and then use a relative XPath starting from that node.
7. Your rename instruction contains one unneeded column, namely facility_type_c. You read only 6 columns, not 7.

So change the middle part of your code to:

data = open('EE.csv', 'w', newline='\n', encoding='utf-8')
csvwriter = csv.writer(data, delimiter=';')
head = ['id', 'name', 'address', 'facility_type', 'site_address', 'list_specs']
csvwriter.writerow(head)

for member in root.findall('asutus'):
    hospital = []
    hospital.append(member.findtext('registrikood'))
    hospital.append(member.findtext('nimi'))
    hospital.append(member.findtext('aadress'))
    tgvLb = member.find('tegevusload/tegevusluba')
    hospital.append(tgvLb.findtext('haiglaliik_nimi'))
    tgvKoht = tgvLb.find('tegevuskohad/tegevuskoht')
    hospital.append(tgvKoht.findtext('aadress'))
    hospital.append(tgvKoht.findtext('teenused/teenus/nimi'))
    csvwriter.writerow(hospital)

data.close()

df = pd.read_csv(r'EE.csv', na_filter=False, delimiter=';')

and drop df.rename from your code.
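Note that findtext('teenused/teenus/nimi') above keeps only the first service name, while the question asked for all of them per site address. A hedged variation (the semicolon separator is an assumption; any delimiter that keeps the CSV to a fixed number of columns works) is to join every match into one field:

import xml.etree.ElementTree as ET

tree = ET.parse("od_asutused.xml")
root = tree.getroot()

for member in root.findall('asutus'):
    tgvLb = member.find('tegevusload/tegevusluba')
    tgvKoht = tgvLb.find('tegevuskohad/tegevuskoht')
    # every <nimi> under this tegevuskoht, not only the first one
    specs = [n.text for n in tgvKoht.findall('teenused/teenus/nimi')]
    print(member.findtext('nimi'), '->', '; '.join(specs))

In the loop above, hospital.append('; '.join(specs)) would then replace the last hospital.append call.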
Python - How to count specific section in a list
I'm brand new to Python and I'm struggling with how to add up certain sections of a csv file in Python. I'm not allowed to use "import csv". I'm importing the TipJoke CSV file from https://vincentarelbundock.github.io/Rdatasets/datasets.html This is the only code I have so far that worked, and I'm at a total loss on where to go from here.

if __name__ == '__main__':
    from pprint import pprint
    from string import punctuation

    f = open("TipJoke.csv", "r")

    tipList = []
    for line in f:
        # deletes the quotes
        line = line.replace('"', '')
        tipList.append(line)

    pprint(tipList)

Output:

[',Card,Tip,Ad,Joke,None\n', '1,None,1,0,0,1\n', '2,Joke,1,0,1,0\n', '3,Ad,0,1,0,0\n', '4,None,0,0,0,1\n', '5,None,1,0,0,1\n', '6,None,0,0,0,1\n', '7,Ad,0,1,0,0\n', '8,Ad,0,1,0,0\n', '9,None,0,0,0,1\n', '10,None,0,0,0,1\n', '11,None,1,0,0,1\n', '12,Ad,0,1,0,0\n', '13,None,0,0,0,1\n', '14,Ad,1,1,0,0\n', '15,Joke,1,0,1,0\n', '16,Joke,0,0,1,0\n', '17,Joke,1,0,1,0\n', '18,None,0,0,0,1\n', '19,Joke,0,0,1,0\n', '20,None,0,0,0,1\n', '21,Ad,1,1,0,0\n', '22,Ad,1,1,0,0\n', '23,Ad,0,1,0,0\n', '24,Joke,0,0,1,0\n', '25,Joke,1,0,1,0\n', '26,Joke,0,0,1,0\n', '27,None,1,0,0,1\n', '28,Joke,1,0,1,0\n', '29,Joke,1,0,1,0\n', '30,None,1,0,0,1\n', '31,Joke,0,0,1,0\n', '32,None,1,0,0,1\n', '33,Joke,1,0,1,0\n', '34,Ad,0,1,0,0\n', '35,Joke,0,0,1,0\n', '36,Ad,1,1,0,0\n', '37,Joke,0,0,1,0\n', '38,Ad,0,1,0,0\n', '39,Joke,0,0,1,0\n', '40,Joke,0,0,1,0\n', '41,Joke,1,0,1,0\n', '42,None,0,0,0,1\n', '43,None,0,0,0,1\n', '44,Ad,0,1,0,0\n', '45,None,0,0,0,1\n', '46,None,0,0,0,1\n', '47,Ad,0,1,0,0\n', '48,Joke,0,0,1,0\n', '49,Joke,1,0,1,0\n', '50,None,1,0,0,1\n', '51,None,0,0,0,1\n', '52,Joke,1,0,1,0\n', '53,Joke,1,0,1,0\n', '54,Joke,0,0,1,0\n', '55,None,1,0,0,1\n', '56,Ad,0,1,0,0\n', '57,Joke,0,0,1,0\n', '58,None,0,0,0,1\n', '59,Ad,0,1,0,0\n', '60,Joke,1,0,1,0\n', '61,Ad,0,1,0,0\n', '62,None,1,0,0,1\n', '63,Joke,0,0,1,0\n', '64,Ad,0,1,0,0\n', '65,Joke,0,0,1,0\n', '66,Ad,0,1,0,0\n', '67,Ad,0,1,0,0\n', '68,Ad,0,1,0,0\n', '69,None,0,0,0,1\n', '70,Joke,1,0,1,0\n', '71,None,1,0,0,1\n', '72,None,0,0,0,1\n', '73,None,0,0,0,1\n', '74,Joke,0,0,1,0\n', '75,Ad,1,1,0,0\n', '76,Ad,0,1,0,0\n', '77,Ad,1,1,0,0\n', '78,Joke,0,0,1,0\n', '79,Joke,0,0,1,0\n', '80,Ad,1,1,0,0\n', '81,Ad,0,1,0,0\n', '82,None,0,0,0,1\n', '83,Ad,0,1,0,0\n', '84,Joke,0,0,1,0\n', '85,Joke,0,0,1,0\n', '86,Ad,1,1,0,0\n', '87,None,1,0,0,1\n', '88,Joke,1,0,1,0\n', '89,Ad,0,1,0,0\n', '90,None,0,0,0,1\n', '91,None,0,0,0,1\n', '92,Joke,0,0,1,0\n', '93,Joke,0,0,1,0\n', '94,Ad,0,1,0,0\n', '95,Ad,0,1,0,0\n', '96,Ad,0,1,0,0\n', '97,Joke,1,0,1,0\n', '98,None,0,0,0,1\n', '99,None,0,0,0,1\n', '100,None,1,0,0,1\n', '101,Joke,0,0,1,0\n', '102,Joke,0,0,1,0\n', '103,Ad,1,1,0,0\n', '104,Ad,0,1,0,0\n', '105,Ad,0,1,0,0\n', '106,Ad,1,1,0,0\n', '107,Ad,0,1,0,0\n', '108,None,0,0,0,1\n', '109,Ad,0,1,0,0\n', '110,Joke,1,0,1,0\n', '111,None,0,0,0,1\n', '112,Ad,0,1,0,0\n', '113,Ad,0,1,0,0\n', '114,None,0,0,0,1\n', '115,Ad,0,1,0,0\n', '116,None,0,0,0,1\n', '117,None,0,0,0,1\n', '118,Ad,0,1,0,0\n', '119,None,1,0,0,1\n', '120,Ad,1,1,0,0\n', '121,Ad,0,1,0,0\n', '122,Ad,1,1,0,0\n', '123,None,0,0,0,1\n', '124,None,0,0,0,1\n', '125,Joke,1,0,1,0\n', '126,Joke,1,0,1,0\n', '127,Ad,0,1,0,0\n', '128,Joke,0,0,1,0\n', '129,Joke,0,0,1,0\n', '130,Ad,0,1,0,0\n', '131,None,0,0,0,1\n', '132,None,0,0,0,1\n', '133,None,0,0,0,1\n', '134,Joke,1,0,1,0\n', '135,Ad,0,1,0,0\n', '136,None,0,0,0,1\n', '137,Joke,0,0,1,0\n', '138,Ad,0,1,0,0\n', '139,Ad,0,1,0,0\n', '140,None,0,0,0,1\n', '141,Joke,0,0,1,0\n', '142,None,0,0,0,1\n', '143,Ad,0,1,0,0\n',
'144,None,1,0,0,1\n', '145,Joke,0,0,1,0\n', '146,Ad,0,1,0,0\n', '147,Ad,0,1,0,0\n', '148,Ad,0,1,0,0\n', '149,Joke,1,0,1,0\n', '150,Ad,1,1,0,0\n', '151,Joke,1,0,1,0\n', '152,None,0,0,0,1\n', '153,Ad,0,1,0,0\n', '154,None,0,0,0,1\n', '155,None,0,0,0,1\n', '156,Ad,0,1,0,0\n', '157,Ad,0,1,0,0\n', '158,Joke,0,0,1,0\n', '159,None,0,0,0,1\n', '160,Joke,1,0,1,0\n', '161,None,1,0,0,1\n', '162,Ad,1,1,0,0\n', '163,Joke,0,0,1,0\n', '164,Joke,0,0,1,0\n', '165,Ad,0,1,0,0\n', '166,Joke,1,0,1,0\n', '167,Joke,1,0,1,0\n', '168,Ad,0,1,0,0\n', '169,Joke,1,0,1,0\n', '170,Joke,0,0,1,0\n', '171,Ad,0,1,0,0\n', '172,Joke,0,0,1,0\n', '173,Joke,0,0,1,0\n', '174,Ad,0,1,0,0\n', '175,None,0,0,0,1\n', '176,Joke,1,0,1,0\n', '177,Ad,0,1,0,0\n', '178,Joke,0,0,1,0\n', '179,Joke,0,0,1,0\n', '180,None,0,0,0,1\n', '181,None,0,0,0,1\n', '182,Ad,0,1,0,0\n', '183,None,0,0,0,1\n', '184,None,0,0,0,1\n', '185,None,0,0,0,1\n', '186,None,0,0,0,1\n', '187,Ad,0,1,0,0\n', '188,None,1,0,0,1\n', '189,Ad,0,1,0,0\n', '190,Ad,0,1,0,0\n', '191,Ad,0,1,0,0\n', '192,Joke,1,0,1,0\n', '193,Joke,0,0,1,0\n', '194,Ad,0,1,0,0\n', '195,None,0,0,0,1\n', '196,Joke,1,0,1,0\n', '197,Joke,0,0,1,0\n', '198,Joke,1,0,1,0\n', '199,Ad,0,1,0,0\n', '200,None,0,0,0,1\n', '201,Joke,1,0,1,0\n', '202,Joke,0,0,1,0\n', '203,Joke,0,0,1,0\n', '204,Ad,0,1,0,0\n', '205,None,0,0,0,1\n', '206,Ad,0,1,0,0\n', '207,Ad,0,1,0,0\n', '208,Joke,0,0,1,0\n', '209,Ad,0,1,0,0\n', '210,Joke,0,0,1,0\n', '211,None,0,0,0,1\n'] I'm currently trying to find the Total number of entries of the specified card type and the Percentage of tips given for the specified card type with two decimal places of precision. The tip column is the 0 or 1 right after the card type (None, Ad, Joke).
If you are allowed to use the pandas library:

import pandas as pd

df = pd.read_csv("TipJoke.csv")

df is a pandas DataFrame object on which you can perform whatever filtering you need. For example, if you want to get the data for Joke, you can filter like this:

print(df[df["Card"] == "Joke"])

Though I'm just giving you the direction here, not the whole logic for your question.
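Extending that direction a little, here is a hedged sketch of the numbers actually asked for, per card type (the column names Card and Tip are taken from the header row in the output above):

import pandas as pd

df = pd.read_csv("TipJoke.csv")

# entries per card type, and percentage of tips (mean of the 0/1 Tip column),
# rounded to two decimal places
summary = df.groupby("Card")["Tip"].agg(entries="count", tip_rate="mean")
summary["tip_pct"] = (summary["tip_rate"] * 100).round(2)
print(summary[["entries", "tip_pct"]])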
This works:

counts = {"Joke": 0, "Ad": 0, "None": 0}

with open("TipJoke.csv", "r") as f:
    for line in f:
        line_clean = line.replace('"', "").replace("\n", "").split(",")
        try:
            counts[line_clean[1]] += int(line_clean[2])
        except:
            # skips the header row (and any malformed line)
            pass

print(counts)
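Since the question also asks for the total number of entries and the tip percentage per card type, here is a hedged extension of the same no-import approach (two-decimal formatting done with round, as one possible choice):

# Track both totals and tips per card type, then derive the percentage
counts = {"Joke": [0, 0], "Ad": [0, 0], "None": [0, 0]}  # [entries, tips]

with open("TipJoke.csv", "r") as f:
    next(f)  # skip the header row
    for line in f:
        fields = line.replace('"', '').strip().split(',')
        card, tip = fields[1], int(fields[2])
        counts[card][0] += 1
        counts[card][1] += tip

for card, (entries, tips) in counts.items():
    pct = round(100 * tips / entries, 2) if entries else 0.0
    print(f"{card}: {entries} entries, {pct}% tipped")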
How to load a CSV with nested arrays
I came across a dataset of Twitter users (Kaggle Source) but I have found that the dataset has a rather strange format. It contains a row with column headers, and then rows of what are essentially JSON arrays. The dataset is also quite large which makes it very difficult to convert the entire file into JSON objects. What is a good way to load this data into Python, preferably a Pandas Dataframe? Example of Data id,screenName,tags,avatar,followersCount,friendsCount,lang,lastSeen,tweetId,friends "1969527638","LlngoMakeEmCum_",[ "#nationaldogday" ],"http://pbs.twimg.com/profile_images/534286217882652672/FNmiQYVO_normal.jpeg",319,112,"en",1472271687519,"769310701580083200",[ "1969574754", "1969295556", "1969284056", "1969612214", "1970067476", "1969797386", "1969430539", "1969840064", "1969698176", "1970005154", "283011644", "1969901029", "1969563175", "1969302314", "1969978662", "1969457936", "1969667533", "1969547821", "1969943478", "1969668032", "283006529", "1969809440", "1969601096", "1969298856", "1969331652", "1969385498", "1969674368", "1969565263", "1970144676", "1969745390", "1969947438", "1969734134", "1969801326", "1969324008", "1969259820", "1969535827", "1970072989", "1969771688", "1969437804", "1969507394", "1969509972", "1969751588", "283012808", "1969302888", "1970224440", "1969603532", "283011244", "1969501046", "1969887518", "1970153138", "1970267527", "1969941955", "1969421654", "1970013110", "1969544905", "1969839590", "1969876500", "1969674625", "1969337952", "1970046536", "1970090934", "1969419133", "1969517215", "1969787869", "1969298065", "1970149771", "1969422638", "1969504268", "1970025554", "1969776001", "1970138611", "1969316186", "1969547558", "1969689272", "283009727", "283015491", "1969526874", "1969662210", "1969536164", "1969320008", "1969893793", "1970158393", "1969365936", "1970194418", "1969942094", "1969631580", "1969704756", "1969920092", "1969712882", "1969791680", "1969408164", "1969754851", "1970205480", "1969840267", "1969443211", "1969706762", "1969692698", "1969751576", "1969486796", "1969286630", "1969686674", "1969833492", "1969294814", "1969472719", "1969685018", "283008559", "283011243", "1969680078", "1969545697", "1969646412", "1969442725", "1969692529" ] "51878493","_notmichelle",[ "#nationaldogday" ],"http://pbs.twimg.com/profile_images/761977602173046786/4_utEHsD_normal.jpg",275,115,"en",1472270622663,"769309490038439936",[ "60789485", "2420931980", "2899776756", "127410795", "38747286", "1345516880", "236076395", "1242946609", "2567887488", "280777286", "2912446303", "1149916171", "3192577639", "239569380", "229974168", "389097282", "266336410", "1850301204", "2364414805", "812302213", "2318240348", "158634793", "542282350", "569664772", "766573472", "703551325", "168564432", "261054460", "402980453", "562547390", "539630318", "165167145", "22216387", "427568285", "61033129", "213519434", "373092437", "170762012", "273601960", "322108757", "1681816280", "357843027", "737471496", "406541143", "1084122632", "633477616", "537821327", "793079732", "2386380799", "479015607", "783354019", "365171478", "625002575", "2326207404", "1653286842", "1676964216", "2296617326", "1583692190", "1315393903", "377660026", "2235123476", "792779641", "351222527", "444993309", "588396446", "377629159", "469383424", "1726612471", "415230430", "942443390", "360924168", "318593248", "565022085", "319679735", "632508305", "377638254", "1392782078", "584483723", "377703135", "180463340", "564978577", "502517645", "1056960042", "285097108", "410245879", "159121042", 
"570399371", "502348447", "960927356", "377196638", "478142245", "335043809", "73546116", "11348282", "901302409", "53255593", "515983155", "391774800", "62351523", "724792351", "346296289", "152520627", "559053427", "508019115", "349996133", "378859519", "65120103", "190070557", "339868374", "417355200", "256729771", "16171898", "45266183", "16143507", "165258639" ]
We could start with something like this (we might need to rethink the use of | as the separator, though; we could go for something more exotic like ╡):

import pandas as pd
import io
import json

data = '''\
id,screenName,tags,avatar,followersCount,friendsCount,lang,lastSeen,tweetId,friends
"1969527638","LlngoMakeEmCum_",[ "#nationaldogday" ],"http://pbs.twimg.com/profile_images/534286217882652672/FNmiQYVO_normal.jpeg",319,112,"en",1472271687519,"769310701580083200",[ "1969574754", "1969295556", "1969284056", "1969612214", "1970067476", "1969797386", "1969430539", "1969840064", "1969698176", "1970005154", "283011644", "1969901029", "1969563175", "1969302314", "1969978662", "1969457936", "1969667533", "1969547821", "1969943478", "1969668032", "283006529", "1969809440", "1969601096", "1969298856", "1969331652", "1969385498", "1969674368", "1969565263", "1970144676", "1969745390", "1969947438", "1969734134", "1969801326", "1969324008", "1969259820", "1969535827", "1970072989", "1969771688", "1969437804", "1969507394", "1969509972", "1969751588", "283012808", "1969302888", "1970224440", "1969603532", "283011244", "1969501046", "1969887518", "1970153138", "1970267527", "1969941955", "1969421654", "1970013110", "1969544905", "1969839590", "1969876500", "1969674625", "1969337952", "1970046536", "1970090934", "1969419133", "1969517215", "1969787869", "1969298065", "1970149771", "1969422638", "1969504268", "1970025554", "1969776001", "1970138611", "1969316186", "1969547558", "1969689272", "283009727", "283015491", "1969526874", "1969662210", "1969536164", "1969320008", "1969893793", "1970158393", "1969365936", "1970194418", "1969942094", "1969631580", "1969704756", "1969920092", "1969712882", "1969791680", "1969408164", "1969754851", "1970205480", "1969840267", "1969443211", "1969706762", "1969692698", "1969751576", "1969486796", "1969286630", "1969686674", "1969833492", "1969294814", "1969472719", "1969685018", "283008559", "283011243", "1969680078", "1969545697", "1969646412", "1969442725", "1969692529" ]
"51878493","_notmichelle",[ "#nationaldogday" ],"http://pbs.twimg.com/profile_images/761977602173046786/4_utEHsD_normal.jpg",275,115,"en",1472270622663,"769309490038439936",[ "60789485", "2420931980", "2899776756", "127410795", "38747286", "1345516880", "236076395", "1242946609", "2567887488", "280777286", "2912446303", "1149916171", "3192577639", "239569380", "229974168", "389097282", "266336410", "1850301204", "2364414805", "812302213", "2318240348", "158634793", "542282350", "569664772", "766573472", "703551325", "168564432", "261054460", "402980453", "562547390", "539630318", "165167145", "22216387", "427568285", "61033129", "213519434", "373092437", "170762012", "273601960", "322108757", "1681816280", "357843027", "737471496", "406541143", "1084122632", "633477616", "537821327", "793079732", "2386380799", "479015607", "783354019", "365171478", "625002575", "2326207404", "1653286842", "1676964216", "2296617326", "1583692190", "1315393903", "377660026", "2235123476", "792779641", "351222527", "444993309", "588396446", "377629159", "469383424", "1726612471", "415230430", "942443390", "360924168", "318593248", "565022085", "319679735", "632508305", "377638254", "1392782078", "584483723", "377703135", "180463340", "564978577", "502517645", "1056960042", "285097108", "410245879", "159121042", "570399371", "502348447", "960927356", "377196638", "478142245", "335043809", "73546116", "11348282", "901302409", "53255593", "515983155", "391774800", "62351523", "724792351", "346296289", "152520627", "559053427", "508019115", "349996133", "378859519", "65120103", "190070557", "339868374", "417355200", "256729771", "16171898", "45266183", "16143507", "165258639" ]'''

# Create new separator (|) after 9th comma (',')
data = '\n'.join(['|'.join(row.split(',', 9)) for row in data.split('\n')])

# REPLACE WITH THIS FOR REAL FILE
#with open('path/to/file') as f:
#    data = '\n'.join(['|'.join(row.split(',', 9)) for row in f.read().split('\n')])

# Read dataframe
df = pd.read_csv(io.StringIO(data), sep='|')

# Convert strings to objects with json module:
df['friends'] = df['friends'].apply(lambda x: json.loads(x))
df['tags'] = df['tags'].apply(lambda x: json.loads(x))

Safer approach:

import pandas as pd
import json

with open('path/to/file') as f:
    columns, *rows = [row.split(',', 9) for row in f.read().split('\n')]

df = pd.DataFrame(rows, columns=columns)

# Convert strings to objects with json module:
df['friends'] = df['friends'].apply(lambda x: json.loads(x))
df['tags'] = df['tags'].apply(lambda x: json.loads(x))