Python - How to count specific sections in a list

I'm brand new to Python and I'm struggling with how to add up certain sections of a CSV file in Python. I'm not allowed to use "import csv".
I'm importing the TipJoke CSV file from https://vincentarelbundock.github.io/Rdatasets/datasets.html
This is the only code I have so far that works, and I'm at a total loss on where to go from here.
if __name__ == '__main__':
    from pprint import pprint
    from string import punctuation

    f = open("TipJoke.csv", "r")
    tipList = []
    for line in f:
        # deletes the quotes
        line = line.replace('"', '')
        tipList.append(line)
    pprint(tipList)
Output:
[',Card,Tip,Ad,Joke,None\n',
'1,None,1,0,0,1\n',
'2,Joke,1,0,1,0\n',
'3,Ad,0,1,0,0\n',
'4,None,0,0,0,1\n',
'5,None,1,0,0,1\n',
'6,None,0,0,0,1\n',
'7,Ad,0,1,0,0\n',
'8,Ad,0,1,0,0\n',
'9,None,0,0,0,1\n',
'10,None,0,0,0,1\n',
'11,None,1,0,0,1\n',
'12,Ad,0,1,0,0\n',
'13,None,0,0,0,1\n',
'14,Ad,1,1,0,0\n',
'15,Joke,1,0,1,0\n',
'16,Joke,0,0,1,0\n',
'17,Joke,1,0,1,0\n',
'18,None,0,0,0,1\n',
'19,Joke,0,0,1,0\n',
'20,None,0,0,0,1\n',
'21,Ad,1,1,0,0\n',
'22,Ad,1,1,0,0\n',
'23,Ad,0,1,0,0\n',
'24,Joke,0,0,1,0\n',
'25,Joke,1,0,1,0\n',
'26,Joke,0,0,1,0\n',
'27,None,1,0,0,1\n',
'28,Joke,1,0,1,0\n',
'29,Joke,1,0,1,0\n',
'30,None,1,0,0,1\n',
'31,Joke,0,0,1,0\n',
'32,None,1,0,0,1\n',
'33,Joke,1,0,1,0\n',
'34,Ad,0,1,0,0\n',
'35,Joke,0,0,1,0\n',
'36,Ad,1,1,0,0\n',
'37,Joke,0,0,1,0\n',
'38,Ad,0,1,0,0\n',
'39,Joke,0,0,1,0\n',
'40,Joke,0,0,1,0\n',
'41,Joke,1,0,1,0\n',
'42,None,0,0,0,1\n',
'43,None,0,0,0,1\n',
'44,Ad,0,1,0,0\n',
'45,None,0,0,0,1\n',
'46,None,0,0,0,1\n',
'47,Ad,0,1,0,0\n',
'48,Joke,0,0,1,0\n',
'49,Joke,1,0,1,0\n',
'50,None,1,0,0,1\n',
'51,None,0,0,0,1\n',
'52,Joke,1,0,1,0\n',
'53,Joke,1,0,1,0\n',
'54,Joke,0,0,1,0\n',
'55,None,1,0,0,1\n',
'56,Ad,0,1,0,0\n',
'57,Joke,0,0,1,0\n',
'58,None,0,0,0,1\n',
'59,Ad,0,1,0,0\n',
'60,Joke,1,0,1,0\n',
'61,Ad,0,1,0,0\n',
'62,None,1,0,0,1\n',
'63,Joke,0,0,1,0\n',
'64,Ad,0,1,0,0\n',
'65,Joke,0,0,1,0\n',
'66,Ad,0,1,0,0\n',
'67,Ad,0,1,0,0\n',
'68,Ad,0,1,0,0\n',
'69,None,0,0,0,1\n',
'70,Joke,1,0,1,0\n',
'71,None,1,0,0,1\n',
'72,None,0,0,0,1\n',
'73,None,0,0,0,1\n',
'74,Joke,0,0,1,0\n',
'75,Ad,1,1,0,0\n',
'76,Ad,0,1,0,0\n',
'77,Ad,1,1,0,0\n',
'78,Joke,0,0,1,0\n',
'79,Joke,0,0,1,0\n',
'80,Ad,1,1,0,0\n',
'81,Ad,0,1,0,0\n',
'82,None,0,0,0,1\n',
'83,Ad,0,1,0,0\n',
'84,Joke,0,0,1,0\n',
'85,Joke,0,0,1,0\n',
'86,Ad,1,1,0,0\n',
'87,None,1,0,0,1\n',
'88,Joke,1,0,1,0\n',
'89,Ad,0,1,0,0\n',
'90,None,0,0,0,1\n',
'91,None,0,0,0,1\n',
'92,Joke,0,0,1,0\n',
'93,Joke,0,0,1,0\n',
'94,Ad,0,1,0,0\n',
'95,Ad,0,1,0,0\n',
'96,Ad,0,1,0,0\n',
'97,Joke,1,0,1,0\n',
'98,None,0,0,0,1\n',
'99,None,0,0,0,1\n',
'100,None,1,0,0,1\n',
'101,Joke,0,0,1,0\n',
'102,Joke,0,0,1,0\n',
'103,Ad,1,1,0,0\n',
'104,Ad,0,1,0,0\n',
'105,Ad,0,1,0,0\n',
'106,Ad,1,1,0,0\n',
'107,Ad,0,1,0,0\n',
'108,None,0,0,0,1\n',
'109,Ad,0,1,0,0\n',
'110,Joke,1,0,1,0\n',
'111,None,0,0,0,1\n',
'112,Ad,0,1,0,0\n',
'113,Ad,0,1,0,0\n',
'114,None,0,0,0,1\n',
'115,Ad,0,1,0,0\n',
'116,None,0,0,0,1\n',
'117,None,0,0,0,1\n',
'118,Ad,0,1,0,0\n',
'119,None,1,0,0,1\n',
'120,Ad,1,1,0,0\n',
'121,Ad,0,1,0,0\n',
'122,Ad,1,1,0,0\n',
'123,None,0,0,0,1\n',
'124,None,0,0,0,1\n',
'125,Joke,1,0,1,0\n',
'126,Joke,1,0,1,0\n',
'127,Ad,0,1,0,0\n',
'128,Joke,0,0,1,0\n',
'129,Joke,0,0,1,0\n',
'130,Ad,0,1,0,0\n',
'131,None,0,0,0,1\n',
'132,None,0,0,0,1\n',
'133,None,0,0,0,1\n',
'134,Joke,1,0,1,0\n',
'135,Ad,0,1,0,0\n',
'136,None,0,0,0,1\n',
'137,Joke,0,0,1,0\n',
'138,Ad,0,1,0,0\n',
'139,Ad,0,1,0,0\n',
'140,None,0,0,0,1\n',
'141,Joke,0,0,1,0\n',
'142,None,0,0,0,1\n',
'143,Ad,0,1,0,0\n',
'144,None,1,0,0,1\n',
'145,Joke,0,0,1,0\n',
'146,Ad,0,1,0,0\n',
'147,Ad,0,1,0,0\n',
'148,Ad,0,1,0,0\n',
'149,Joke,1,0,1,0\n',
'150,Ad,1,1,0,0\n',
'151,Joke,1,0,1,0\n',
'152,None,0,0,0,1\n',
'153,Ad,0,1,0,0\n',
'154,None,0,0,0,1\n',
'155,None,0,0,0,1\n',
'156,Ad,0,1,0,0\n',
'157,Ad,0,1,0,0\n',
'158,Joke,0,0,1,0\n',
'159,None,0,0,0,1\n',
'160,Joke,1,0,1,0\n',
'161,None,1,0,0,1\n',
'162,Ad,1,1,0,0\n',
'163,Joke,0,0,1,0\n',
'164,Joke,0,0,1,0\n',
'165,Ad,0,1,0,0\n',
'166,Joke,1,0,1,0\n',
'167,Joke,1,0,1,0\n',
'168,Ad,0,1,0,0\n',
'169,Joke,1,0,1,0\n',
'170,Joke,0,0,1,0\n',
'171,Ad,0,1,0,0\n',
'172,Joke,0,0,1,0\n',
'173,Joke,0,0,1,0\n',
'174,Ad,0,1,0,0\n',
'175,None,0,0,0,1\n',
'176,Joke,1,0,1,0\n',
'177,Ad,0,1,0,0\n',
'178,Joke,0,0,1,0\n',
'179,Joke,0,0,1,0\n',
'180,None,0,0,0,1\n',
'181,None,0,0,0,1\n',
'182,Ad,0,1,0,0\n',
'183,None,0,0,0,1\n',
'184,None,0,0,0,1\n',
'185,None,0,0,0,1\n',
'186,None,0,0,0,1\n',
'187,Ad,0,1,0,0\n',
'188,None,1,0,0,1\n',
'189,Ad,0,1,0,0\n',
'190,Ad,0,1,0,0\n',
'191,Ad,0,1,0,0\n',
'192,Joke,1,0,1,0\n',
'193,Joke,0,0,1,0\n',
'194,Ad,0,1,0,0\n',
'195,None,0,0,0,1\n',
'196,Joke,1,0,1,0\n',
'197,Joke,0,0,1,0\n',
'198,Joke,1,0,1,0\n',
'199,Ad,0,1,0,0\n',
'200,None,0,0,0,1\n',
'201,Joke,1,0,1,0\n',
'202,Joke,0,0,1,0\n',
'203,Joke,0,0,1,0\n',
'204,Ad,0,1,0,0\n',
'205,None,0,0,0,1\n',
'206,Ad,0,1,0,0\n',
'207,Ad,0,1,0,0\n',
'208,Joke,0,0,1,0\n',
'209,Ad,0,1,0,0\n',
'210,Joke,0,0,1,0\n',
'211,None,0,0,0,1\n']
I'm currently trying to find the total number of entries for a specified card type, and the percentage of tips given for that card type with two decimal places of precision. The tip column is the 0 or 1 right after the card type (None, Ad, Joke).

If you are allowed to use the pandas library, then:
import pandas as pd
df = pd.read_csv("TipJoke.csv")
df is a pandas DataFrame object on which you can perform multiple filtering tasks according to your needs.
For example, if you want to get the data for Joke, you can filter like this:
print(df[df["Card"] == "Joke"])
Though I'm just providing you the direction here, not the whole logic for your question.
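For instance, a minimal sketch of the full calculation (assuming the Card and Tip column names from the header shown in the question):
import pandas as pd

df = pd.read_csv("TipJoke.csv")
for card in ("None", "Ad", "Joke"):
    subset = df[df["Card"] == card]       # rows for this card type
    total = len(subset)                   # total entries of the card type
    tip_pct = 100 * subset["Tip"].mean()  # Tip is 0/1, so the mean is the tip rate
    print(f"{card}: {total} entries, {tip_pct:.2f}% tipped")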

This works:
counts = {"Joke": 0, "Ad": 0, "None": 0}
with open("TipJoke.csv", "r") as f:
    for line in f:
        line_clean = line.replace('"', "").replace("\n", "").split(",")
        try:
            counts[line_clean[1]] += int(line_clean[2])
        except (KeyError, IndexError, ValueError):
            # the header row and any malformed lines are skipped here
            pass
print(counts)
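To also get what the question asks for - the total number of entries per card type and the tip percentage with two decimal places - one way to extend this sketch, still without the csv module, is:
totals = {"Joke": 0, "Ad": 0, "None": 0}
tips = {"Joke": 0, "Ad": 0, "None": 0}
with open("TipJoke.csv", "r") as f:
    next(f)  # skip the header row
    for line in f:
        fields = line.replace('"', '').strip().split(',')
        card, tip = fields[1], int(fields[2])
        if card in totals:
            totals[card] += 1
            tips[card] += tip
for card in totals:
    pct = 100 * tips[card] / totals[card] if totals[card] else 0.0
    print(f"{card}: {totals[card]} entries, {pct:.2f}% tipped")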

Related

Supplementary quotes appearing in my CSV using Python code

I wrote some code to generate multiple addresses and export them to CSV:
import csv
import ipaddress
import random
from random import shuffle
LAN = ipaddress.ip_network('192.168.0.0/16')
WAN1 = ipaddress.ip_network('11.10.8.0/22')
WAN2 = ipaddress.ip_network('12.10.8.0/22')
LAN_IP_Adresses = [ IP_LAN for IP_LAN in LAN.hosts()]
WAN1_IP_Adresses = [ IP_WAN1 for IP_WAN1 in WAN1.hosts()]
WAN2_IP_Adresses = [ IP_WAN2 for IP_WAN2 in WAN2.hosts()]
index_IP_GW = len(WAN1_IP_Adresses)-1
locations_list=['Abidjan','Abu Dhabi','Adana','Adelaide', 'Ahmadabad','Algiers','Amsterdam','Ankara','Anshan','Athens','BANGKOK','BUCHAREST','BUDAPEST','Bagdad','Bandung','Bangalore','Bangkok','Barcelona','Beirut','Belgrade','Bern','Bogota','Brasilia','Brazzaville','Brussels','Bursa','CAIRO','CARACAS','CONAKRY','Canberra','Casablanca','Changchun','Chengdu','Chicago','Copenhagen','Dakar','MINSK','Madrid','Medina','Nairobi','Napoli','Montreal',
'Odessa','Omdurman','Osaka','Ottawa','PYONGYANG','Paris','Pekin', 'Perth','Philadelphia','Phoenix','Pimpri Chinchwad','Porto','Porto Alegre','QUITO','Qingdao','Rabat','Rajkot','Riadh','Rio de Janeiro','Rome','SANTIAGO','Salvador','Samara','San Antonio','San Francisco','Sao Paulo','Sendai','Seongnam','Seoul','Shanghai','Singapore','Sydney','Taiyuan','Tehran','Tijuana','Tokyo','Toronto','Moscou','Moscow','Mumbai (Bombay)','Munich','México','Milan',
'Tripoli','Tunis','Vienna','Warsaw','Wuhan','Xian','Yaounde','Yokohama', 'Zapopan','hong kong','Dallas','Delhi','Doha','Dublin','Durban','Ecatepec','Frankfurt','Fukuoka','Giza','Hamburg','Havana','Helsinki','Houston','Hyderabad','Istanbul','Jaipur','Jakarta','Jeddah','Johannesburg','KIEV','Kaduna','Kano','Kazan','Kuala Lumpur','Kyoto','LUANDA','Lahore','Lanzhou','Le Caire','Leon','Lima','Lisbon','London','Los Angeles','Lyon','MANILA','Melbourne','New York']
#Site_Nmb=1
def initial_Sites_list_generator(filename='SITES_CI.csv', Number_of_Sites=1000):
    file_to_output = open(filename, 'w', newline='')
    csv_writer = csv.writer(file_to_output, delimiter=',')
    Site_Nbr = 1
    index = 0
    csv_writer.writerow(["SITE_NAME", "SERIAL_NUMBER", '"LAN_IP_ADDRESS"', '"WAN_IP_ADDRESS1"', '"WAN_IP_ADDRESS2"', '"GATEWAY_IP_ADDRESS1"', '"GATEWAY_IP_ADDRESS2"', '"ROUTE_REFLECTOR"', '"LOCATIONS"', '"HARDWAREMODEL"', '"LANINT"', '"WANINT1"', '"WANINT2"', '"BW_OUT"', '"BW_IN"'])
    for i in range(1, Number_of_Sites + 1):
        shuffle(locations_list)
        location = random.choice(locations_list)
        csv_writer.writerow(['"SITE' + str(Site_Nbr) + '"', "2e70129bde9c4426b9213d4408c300", f'"{(LAN_IP_Adresses[index])}"', f'"{str(WAN1_IP_Adresses[index])}"', f'"{str(WAN2_IP_Adresses[index])}"', f'"{str(WAN1_IP_Adresses[index_IP_GW])}"', f'"{str(WAN2_IP_Adresses[index_IP_GW])}"', '"False"', f'"{location}"', '"ONEv600"', '"gigabitethernet0/2"', '"gigabitethernet0/0"', '"gigabitethernet0/1"', '"10"', '"20"'])
        Site_Nbr = Site_Nbr + 1
        index = index + 1
    file_to_output.close()
initial_Sites_list_generator('SITES_OVP.csv', 1000)
but I got unnecessary quotes added to my CSV.
You are adding the extra quotes yourself. In your for loop, change this line:
csv_writer.writerow(['"SITE'+ str(Site_Nbr)+'"',"2e70129bde9c4426b9213d4408c300",f'"{(LAN_IP_Adresses[index])}"',f'"{str(WAN1_IP_Adresses[index])}"',f'"{str(WAN2_IP_Adresses[index])}"',f'"{str(WAN1_IP_Adresses[index_IP_GW])}"',f'"{str(WAN2_IP_Adresses[index_IP_GW])}"','"False"',f'"{location}"','"ONEv600"','"gigabitethernet0/2"','"gigabitethernet0/0"','"gigabitethernet0/1"','"10"','"20"'])
to this:
csv_writer.writerow(['SITE' + str(Site_Nbr),
                     "2e70129bde9c4426b9213d4408c300",
                     f'{LAN_IP_Adresses[index]}',
                     f'{str(WAN1_IP_Adresses[index])}',
                     f'{str(WAN2_IP_Adresses[index])}',
                     f'{str(WAN1_IP_Adresses[index_IP_GW])}',
                     f'{str(WAN2_IP_Adresses[index_IP_GW])}',
                     'False',
                     f'{location}',
                     'ONEv600',
                     'gigabitethernet0/2',
                     'gigabitethernet0/0',
                     'gigabitethernet0/1',
                     '10',
                     '20'])
The CSV writer already adds quotes to strings as appropriate.
I did
csv_writer = csv.writer(file_to_output, delimiter=",", quoting=csv.QUOTE_ALL)
and it worked!
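For anyone wondering what the quoting parameter changes, here is a small self-contained sketch (QUOTE_MINIMAL is the csv module's default):
import csv
import io

row = ['SITE1', '192.168.0.1', 'False']
for quoting in (csv.QUOTE_MINIMAL, csv.QUOTE_ALL):
    buf = io.StringIO()
    csv.writer(buf, quoting=quoting).writerow(row)
    print(buf.getvalue(), end='')
# SITE1,192.168.0.1,False         (QUOTE_MINIMAL quotes only when needed)
# "SITE1","192.168.0.1","False"   (QUOTE_ALL quotes every field)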

Extract letters after $ symbol using Pandas

I am trying to extract just the data up to and including the $ symbol from a spreadsheet.
I have isolated the data to give me just the column containing it, but what I am trying to do is extract any and all symbols that follow a $ symbol.
For example:
$AAPL, $LOW, $TSLA and so on from the entire dataset, but I don't need or want $1000, $600 and so on. Letters only (a-z), followed by either a period or a space, is what I am trying to get.
I haven't been successful in full extraction, and my code is starting to get messy, so I'll provide the code that will bring back the data for you to see for yourself. I am using Jupyter Notebook.
import mysql.connector
import pandas

googleSheedID = '15fhpxqWDRWkNtEFhi9bQyWUg8pDn4B-R2N18s1xFYTU'
worksheetName = 'Sheet1'
URL = 'https://docs.google.com/spreadsheets/d/{0}/gviz/tq?tqx=out:csv&sheet={1}'.format(
    googleSheedID,
    worksheetName
)
df = pandas.read_csv(URL)
del df['DATE']
del df['USERNAME']
del df['LINK']
del df['LINK2']
df = df[df["TWEET"].str.contains("RT") == False]  # the result must be assigned back
print(df)
Not sure if I understand what you want correctly, but the following code gives all the elements that come after $ and before a blank space.
import mysql.connector
import pandas

googleSheedID = '15fhpxqWDRWkNtEFhi9bQyWUg8pDn4B-R2N18s1xFYTU'
worksheetName = 'Sheet1'
URL = 'https://docs.google.com/spreadsheets/d/{0}/gviz/tq?tqx=out:csv&sheet={1}'.format(
    googleSheedID,
    worksheetName
)
df = pandas.read_csv(URL)
del df['DATE']
del df['USERNAME']
del df['LINK']
del df['LINK2']
unique_results = []
for i in range(len(df['TWEET'])):
    if 'RT' in df['TWEET'][i]:
        continue
    else:
        for j in range(len(df['TWEET'][i]) - 1):
            if df['TWEET'][i][j] == '$':
                if df['TWEET'][i][j + 1] in '1234567890':
                    continue
                else:
                    start = j
                    for k in range(start, len(df['TWEET'][i])):
                        if df['TWEET'][i][k] == ' ' or df['TWEET'][i][k:k + 1] == '\n':
                            end = k
                            break
                    results = df['TWEET'][i][start:end]
                    if results not in unique_results:
                        unique_results.append(results)
print(unique_results)
Edit: fixed the code.
The output is:
['$GME', '$SNDL', '$FUBO', '$AMC', '$LOTZ', '$CLOV', '$USAS', '$AIHS', '$PLM', '$LODE', '$TTNP', '$IMTE', '', '$NAK.', '$NAK', '$CRBP', '$AREC', '$NTEC', '$NTN', '$CBAT', '$ZYNE', '$HOFV', '$GWPH', '$KERN', '$ZYNE,', '$AIM', '$WWR', '$CARV', '$VISL', '$SINO', '$NAKD', '$GRPS', '$RSHN', '$MARA', '$RIOT', '$NXTD', '$LAC', '$BTC', '$ITRM', '$CHCI', '$VERU', '$GMGI', '$WNBD', '$KALV', '$EGOC', '$Veru', '$MRNA', '$PVDG', '$DROP', '$EFOI', '$LLIT', '$AUVI', '$CGIX', '$RELI', '$TLRY', '$ACB', '$TRCH', '$TRCH.', '$TSLA', '$cciv', '$sndl', '$ANCN', '$TGC', '$tlry', '$KXIN', '$AMZN', '$INFI', '$LMND', '$COMS', '$VXX', '$LEDS', '$ACY', '$RHE', '$SINO.', '$GPL', '$SPCE', '$OXY', '$CLSN', '$FTFT', '$FTFT.....', '$BIEI', '$EDRY', '$CLEU', '$FSR', '$SPY', '$NIO', '$LI', '$XPEV,', '$UL', '$RGLG', '$SOS', '$QS', '$THCB', '$SUNW', '$MICT', '$BTC.X', '$T', '$ADOM', '$EBON', '$CLPS', '$HIHO', '$ONTX', '$WNRS', '$SOLO', '$Mara,', '$Riot,', '$SOS,', '$GRNQ,', '$RCON,', '$FTFT,', '$BTBT,', '$MOGO,', '$EQOS,', '$CCNC', '$CCIV', '$tsla', '$fsr', '$wkhs', '$ride', '$nio', '$NETE', '$DPW', '$MOSY', '$SSNT', '$PLTR', '$GSAH:', '$EQOS', '$MTSL', '$CMPS', '$CHIF', '$MU', '$HST', '$SNAP', '$CTXR', '$acy', '$FUBOTV', '$DPBE', '$HYLN', '$SPOT', '$NSAV', '$HYLN,', '$aabb', '$AAL', '$BBIG', '$ITNS', '$CTIB', '$AMPG', '$ZI', '$NUVI', '$INTC', '$TSM', '$AAPL', '$MRJT', '$RCMT', '$IZEA', '$BBIG,', '$ARKK', '$LIAUTO', '$MARA:', '$SOS:', '$XOM', '$ET', '$BRNW', '$SYPR', '$LCID', '$QCOM', '$FIZZ', '$TRVG', '$SLV', '$RAFA', '$TGCTengasco,', '$BYND', '$XTNT', '$NBY', '$sos', '$KMPH', '$', '$(0.60)', '$(0.64)', '$BIDU', '$rkt', '$GTT', '$CHUC', '$CLF', '$INUV', '$RKT', '$COST', '$MDCN', '$HCMC', '$UWMC', '$riot', '$OVID', '$HZON', '$SKT', '$FB', '$PLUG', '$BA', '$PYPL', '$PSTH.', '$NVDA', '$AMPG.', '$aese.', '$spy', '$pltr', '$MSFT', '$AMD', '$QQQ', '$LTNC', '$WKHS', '$EYES', '$RMO', '$GNUS', '$gme', '$mdmp', '$kern', '$AEI', '$BABA', '$YALA', '$TWTR', '$WISH', '$GE', '$ORCL', '$JUPW', '$TMBR', '$SSYS', '$NKE', '$AMPGAmpliTech', '$$$', '$$', '$RGLS', '$HOGE', '$GEGR', '$nclh', '$IGAC', '$FCEL', '$TKAT', '$OCG', '$YVR', '$IPDN.', '$IPDN', "$SINO's", '$WIMI', '$TKAT.', '$BAC', '$LZR', '$LGHL', '$F', '$GM', '$KODK', '$atvk', '$ATVK', '$AIKI', '$DS', '$AI', '$WTII', '$oxy', '$DYAI', '$DSS', '$ZKIN', '$MFH', '$WKEY', '$MKGI', '$DLPN', '$PSWW', '$SNOW', '$ALYA', '$AESE', '$CSCW', '$CIDM', '$HOFV.', '$LIVX', '$FNKO', '$HPR', '$BRQS', '$GIGM', '$APOP', '$EA', '$CUEN', '$TMBR?', '$FLNT,', '$APPS', '$METX', '$STG', '$WSRC', '$AMHC', '$VIAC', '$MO', '$UAVL', '$CS', '$MDT', '$GYST', '$CBBT', '$ASTC', '$AACG', '$WAFU.', '$WAFU', '$CASI', '$mmmw', '$MVIS', '$SNOA', '$C', '$KR', '$EWZ', '$VALE', '$EWZ.', '$CSCO', '$PINS', '$XSPA', '$VPRX', '$CEMI', '$M', '$BMRA', '$SPX', '$akt', '$SURG', '$NCLH', '$ARSN', '$ODT', '$SGBX', '$CRWD.', '$TGRR', '$PENN', '$BB', '$XOP', '$XL', '$FREQ', '$IDRA', '$DKNG', '$COHN', '$ADHC', '$ISWH', '$LEGO', '$OTRA', '$NAAC', '$HCAR', '$PPGH', '$SDAC', '$PNTM', '$OUST', '$IO', '$HQGE', '$HENC', '$KYNC', '$ATNF', '$BNSO', '$HDSN', '$AABB', '$SGH', '$BMY', '$VERY', '$EARS', '$ROKU', '$PIXY', '$APRE', '$SFET', '$SQ', '$EEIQ', '$REDU', '$CNWT', '$NFLX', '$RGBPP', '$RGBP', '$SHOP', '$VITL', '$RAAS', '$CPNG', '$JKS', '$COMP', '$NAFS']
You can use regular expressions.
\$[a-zA-Z]+
After reading the df, execute the code below:
import re

# Create empty lists for the results
results = []
final_results = []
for row_num in range(len(df['TWEET'])):
    string_to_check = df['TWEET'][row_num]
    # Check for RT at the beginning of the string only.
    # "if 'RT' in df['TWEET'][row_num]" would have found "RT" anywhere in the string.
    if re.match(r"^RT", string_to_check):
        continue
    else:
        # Check for all words starting with $ and followed by letters only.
        # This will find $FOOBAR but not $600, $6FOOBAR & $FOO6BAR
        rel_text_l = re.findall(r"\$[a-zA-Z]+", string_to_check)
        # Check for an empty list
        if rel_text_l:
            # Add the elements of the list to the results directly
            results.extend(rel_text_l)
# Make a set of the list to remove duplicates
final_results = list(set(results))
print(results)
print(final_results)
The results are
['$GME', '$FOOBAR', '$FOO', '$SNDL', '$FUBO', '$AMC', '$GME', '$LOTZ', '$CLOV', '$USAS', '$GOBLIN', '$LTNC']
['$LTNC', '$GOBLIN', '$AMC', '$FOO', '$FOOBAR', '$LOTZ', '$CLOV', '$SNDL', '$GME', '$USAS', '$FUBO']
Notice that the duplicate $GME appears only once in final_results.
If you were not bothered about removing tweets starting with RT, all this could be achieved in one line of code:
direct_result = list(set(re.findall(r"\$[a-zA-Z]+", str(df['TWEET']))))
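As a side note, pandas can apply the same pattern without the explicit loop via Series.str.findall (a sketch, assuming the same df and TWEET column as above):
mask = ~df['TWEET'].str.match(r'RT')               # drop tweets starting with RT
matches = df.loc[mask, 'TWEET'].str.findall(r'\$[a-zA-Z]+')
final_results = list(set(matches.sum()))           # flatten the per-tweet lists, deduplicate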

Fastest way to count non spacing chars in Unicode text in Python

Given the Unicode non-spacing marks list - https://www.fileformat.info/info/unicode/category/Mn/list.htm
UNICODE_NSM = ['\u0300', '\u0301', '\u0302', '\u0303', '\u0304', '\u0305', '\u0306', '\u0307', '\u0308', '\u0309', '\u030A', '\u030B', '\u030C', '\u030D', '\u030E', '\u030F', '\u0310', '\u0311', '\u0312', '\u0313', '\u0314', '\u0315', '\u0316', '\u0317', '\u0318', '\u0319', '\u031A', '\u031B', '\u031C', '\u031D', '\u031E', '\u031F', '\u0320', '\u0321', '\u0322', '\u0323', '\u0324', '\u0325', '\u0326', '\u0327', '\u0328', '\u0329', '\u032A', '\u032B', '\u032C', '\u032D', '\u032E', '\u032F', '\u0330', '\u0331', '\u0332', '\u0333', '\u0334', '\u0335', '\u0336', '\u0337', '\u0338', '\u0339', '\u033A', '\u033B', '\u033C', '\u033D', '\u033E', '\u033F', '\u0340', '\u0341', '\u0342', '\u0343', '\u0344', '\u0345', '\u0346', '\u0347', '\u0348', '\u0349', '\u034A', '\u034B', '\u034C', '\u034D', '\u034E', '\u034F', '\u0350', '\u0351', '\u0352', '\u0353', '\u0354', '\u0355', '\u0356', '\u0357', '\u0358', '\u0359', '\u035A', '\u035B', '\u035C', '\u035D', '\u035E', '\u035F', '\u0360', '\u0361', '\u0362', '\u0363', '\u0364', '\u0365', '\u0366', '\u0367', '\u0368', '\u0369', '\u036A', '\u036B', '\u036C', '\u036D', '\u036E', '\u036F', '\u0483', '\u0484', '\u0485', '\u0486', '\u0487', '\u0591', '\u0592', '\u0593', '\u0594', '\u0595', '\u0596', '\u0597', '\u0598', '\u0599', '\u059A', '\u059B', '\u059C', '\u059D', '\u059E', '\u059F', '\u05A0', '\u05A1', '\u05A2', '\u05A3', '\u05A4', '\u05A5', '\u05A6', '\u05A7', '\u05A8', '\u05A9', '\u05AA', '\u05AB', '\u05AC', '\u05AD', '\u05AE', '\u05AF', '\u05B0', '\u05B1', '\u05B2', '\u05B3', '\u05B4', '\u05B5', '\u05B6', '\u05B7', '\u05B8', '\u05B9', '\u05BA', '\u05BB', '\u05BC', '\u05BD', '\u05BF', '\u05C1', '\u05C2', '\u05C4', '\u05C5', '\u05C7', '\u0610', '\u0611', '\u0612', '\u0613', '\u0614', '\u0615', '\u0616', '\u0617', '\u0618', '\u0619', '\u061A', '\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650', '\u0651', '\u0652', '\u0653', '\u0654', '\u0655', '\u0656', '\u0657', '\u0658', '\u0659', '\u065A', '\u065B', '\u065C', '\u065D', '\u065E', '\u065F', '\u0670', '\u06D6', '\u06D7', '\u06D8', '\u06D9', '\u06DA', '\u06DB', '\u06DC', '\u06DF', '\u06E0', '\u06E1', '\u06E2', '\u06E3', '\u06E4', '\u06E7', '\u06E8', '\u06EA', '\u06EB', '\u06EC', '\u06ED', '\u0711', '\u0730', '\u0731', '\u0732', '\u0733', '\u0734', '\u0735', '\u0736', '\u0737', '\u0738', '\u0739', '\u073A', '\u073B', '\u073C', '\u073D', '\u073E', '\u073F', '\u0740', '\u0741', '\u0742', '\u0743', '\u0744', '\u0745', '\u0746', '\u0747', '\u0748', '\u0749', '\u074A', '\u07A6', '\u07A7', '\u07A8', '\u07A9', '\u07AA', '\u07AB', '\u07AC', '\u07AD', '\u07AE', '\u07AF', '\u07B0', '\u07EB', '\u07EC', '\u07ED', '\u07EE', '\u07EF', '\u07F0', '\u07F1', '\u07F2', '\u07F3', '\u0816', '\u0817', '\u0818', '\u0819', '\u081B', '\u081C', '\u081D', '\u081E', '\u081F', '\u0820', '\u0821', '\u0822', '\u0823', '\u0825', '\u0826', '\u0827', '\u0829', '\u082A', '\u082B', '\u082C', '\u082D', '\u0859', '\u085A', '\u085B', '\u08E4', '\u08E5', '\u08E6', '\u08E7', '\u08E8', '\u08E9', '\u08EA', '\u08EB', '\u08EC', '\u08ED', '\u08EE', '\u08EF', '\u08F0', '\u08F1', '\u08F2', '\u08F3', '\u08F4', '\u08F5', '\u08F6', '\u08F7', '\u08F8', '\u08F9', '\u08FA', '\u08FB', '\u08FC', '\u08FD', '\u08FE', '\u0900', '\u0901', '\u0902', '\u093A', '\u093C', '\u093E', '\u0941', '\u0942', '\u0943', '\u0944', '\u0945', '\u0946', '\u0947', '\u0948', '\u094D', '\u0951', '\u0952', '\u0953', '\u0954', '\u0955', '\u0956', '\u0957', '\u0962', '\u0963', '\u0981', '\u09BC', '\u09C1', '\u09C2', '\u09C3', '\u09C4', '\u09CD', '\u09E2', '\u09E3', '\u0A01', '\u0A02', 
'\u0A3C', '\u0A41', '\u0A42', '\u0A47', '\u0A48', '\u0A4B', '\u0A4C', '\u0A4D', '\u0A51', '\u0A70', '\u0A71', '\u0A75', '\u0A81', '\u0A82', '\u0ABC', '\u0AC1', '\u0AC2', '\u0AC3', '\u0AC4', '\u0AC5', '\u0AC7', '\u0AC8', '\u0ACD', '\u0AE2', '\u0AE3', '\u0B01', '\u0B3C', '\u0B3F', '\u0B41', '\u0B42', '\u0B43', '\u0B44', '\u0B4D', '\u0B56', '\u0B62', '\u0B63', '\u0B82', '\u0BC0', '\u0BCD', '\u0C3E', '\u0C3F', '\u0C40', '\u0C46', '\u0C47', '\u0C48', '\u0C4A', '\u0C4B', '\u0C4C', '\u0C4D', '\u0C55', '\u0C56', '\u0C62', '\u0C63', '\u0CBC', '\u0CBF', '\u0CC6', '\u0CCC', '\u0CCD', '\u0CE2', '\u0CE3', '\u0D41', '\u0D42', '\u0D43', '\u0D44', '\u0D4D', '\u0D62', '\u0D63', '\u0DCA', '\u0DD2', '\u0DD3', '\u0DD4', '\u0DD6', '\u0E31', '\u0E34', '\u0E35', '\u0E36', '\u0E37', '\u0E38', '\u0E39', '\u0E3A', '\u0E47', '\u0E48', '\u0E49', '\u0E4A', '\u0E4B', '\u0E4C', '\u0E4D', '\u0E4E', '\u0EB1', '\u0EB4', '\u0EB5', '\u0EB6', '\u0EB7', '\u0EB8', '\u0EB9', '\u0EBB', '\u0EBC', '\u0EC8', '\u0EC9', '\u0ECA', '\u0ECB', '\u0ECC', '\u0ECD', '\u0F18', '\u0F19', '\u0F35', '\u0F37', '\u0F39', '\u0F71', '\u0F72', '\u0F73', '\u0F74', '\u0F75', '\u0F76', '\u0F77', '\u0F78', '\u0F79', '\u0F7A', '\u0F7B', '\u0F7C', '\u0F7D', '\u0F7E', '\u0F80', '\u0F81', '\u0F82', '\u0F83', '\u0F84', '\u0F86', '\u0F87', '\u0F8D', '\u0F8E', '\u0F8F', '\u0F90', '\u0F91', '\u0F92', '\u0F93', '\u0F94', '\u0F95', '\u0F96', '\u0F97', '\u0F99', '\u0F9A', '\u0F9B', '\u0F9C', '\u0F9D', '\u0F9E', '\u0F9F', '\u0FA0', '\u0FA1', '\u0FA2', '\u0FA3', '\u0FA4', '\u0FA5', '\u0FA6', '\u0FA7', '\u0FA8', '\u0FA9', '\u0FAA', '\u0FAB', '\u0FAC', '\u0FAD', '\u0FAE', '\u0FAF', '\u0FB0', '\u0FB1', '\u0FB2', '\u0FB3', '\u0FB4', '\u0FB5', '\u0FB6', '\u0FB7', '\u0FB8', '\u0FB9', '\u0FBA', '\u0FBB', '\u0FBC', '\u0FC6', '\u102D', '\u102E', '\u102F', '\u1030', '\u1032', '\u1033', '\u1034', '\u1035', '\u1036', '\u1037', '\u1039', '\u103A', '\u103D', '\u103E', '\u1058', '\u1059', '\u105E', '\u105F', '\u1060', '\u1071', '\u1072', '\u1073', '\u1074', '\u1082', '\u1085', '\u1086', '\u108D', '\u109D', '\u135D', '\u135E', '\u135F', '\u1712', '\u1713', '\u1714', '\u1732', '\u1733', '\u1734', '\u1752', '\u1753', '\u1772', '\u1773', '\u17B4', '\u17B5', '\u17B7', '\u17B8', '\u17B9', '\u17BA', '\u17BB', '\u17BC', '\u17BD', '\u17C6', '\u17C9', '\u17CA', '\u17CB', '\u17CC', '\u17CD', '\u17CE', '\u17CF', '\u17D0', '\u17D1', '\u17D2', '\u17D3', '\u17DD', '\u180B', '\u180C', '\u180D', '\u18A9', '\u1920', '\u1921', '\u1922', '\u1927', '\u1928', '\u1932', '\u1939', '\u193A', '\u193B', '\u1A17', '\u1A18', '\u1A56', '\u1A58', '\u1A59', '\u1A5A', '\u1A5B', '\u1A5C', '\u1A5D', '\u1A5E', '\u1A60', '\u1A62', '\u1A65', '\u1A66', '\u1A67', '\u1A68', '\u1A69', '\u1A6A', '\u1A6B', '\u1A6C', '\u1A73', '\u1A74', '\u1A75', '\u1A76', '\u1A77', '\u1A78', '\u1A79', '\u1A7A', '\u1A7B', '\u1A7C', '\u1A7F', '\u1B00', '\u1B01', '\u1B02', '\u1B03', '\u1B34', '\u1B36', '\u1B37', '\u1B38', '\u1B39', '\u1B3A', '\u1B3C', '\u1B42', '\u1B6B', '\u1B6C', '\u1B6D', '\u1B6E', '\u1B6F', '\u1B70', '\u1B71', '\u1B72', '\u1B73', '\u1B80', '\u1B81', '\u1BA2', '\u1BA3', '\u1BA4', '\u1BA5', '\u1BA8', '\u1BA9', '\u1BAB', '\u1BE6', '\u1BE8', '\u1BE9', '\u1BED', '\u1BEF', '\u1BF0', '\u1BF1', '\u1C2C', '\u1C2D', '\u1C2E', '\u1C2F', '\u1C30', '\u1C31', '\u1C32', '\u1C33', '\u1C36', '\u1C37', '\u1CD0', '\u1CD1', '\u1CD2', '\u1CD4', '\u1CD5', '\u1CD6', '\u1CD7', '\u1CD8', '\u1CD9', '\u1CDA', '\u1CDB', '\u1CDC', '\u1CDD', '\u1CDE', '\u1CDF', '\u1CE0', '\u1CE2', '\u1CE3', '\u1CE4', '\u1CE5', '\u1CE6', '\u1CE7', '\u1CE8', '\u1CED', 
'\u1CF4', '\u1DC0', '\u1DC1', '\u1DC2', '\u1DC3', '\u1DC4', '\u1DC5', '\u1DC6', '\u1DC7', '\u1DC8', '\u1DC9', '\u1DCA', '\u1DCB', '\u1DCC', '\u1DCD', '\u1DCE', '\u1DCF', '\u1DD0', '\u1DD1', '\u1DD2', '\u1DD3', '\u1DD4', '\u1DD5', '\u1DD6', '\u1DD7', '\u1DD8', '\u1DD9', '\u1DDA', '\u1DDB', '\u1DDC', '\u1DDD', '\u1DDE', '\u1DDF', '\u1DE0', '\u1DE1', '\u1DE2', '\u1DE3', '\u1DE4', '\u1DE5', '\u1DE6', '\u1DFC', '\u1DFD', '\u1DFE', '\u1DFF', '\u20D0', '\u20D1', '\u20D2', '\u20D3', '\u20D4', '\u20D5', '\u20D6', '\u20D7', '\u20D8', '\u20D9', '\u20DA', '\u20DB', '\u20DC', '\u20E1', '\u20E5', '\u20E6', '\u20E7', '\u20E8', '\u20E9', '\u20EA', '\u20EB', '\u20EC', '\u20ED', '\u20EE', '\u20EF', '\u20F0', '\u2CEF', '\u2CF0', '\u2CF1', '\u2D7F', '\u2DE0', '\u2DE1', '\u2DE2', '\u2DE3', '\u2DE4', '\u2DE5', '\u2DE6', '\u2DE7', '\u2DE8', '\u2DE9', '\u2DEA', '\u2DEB', '\u2DEC', '\u2DED', '\u2DEE', '\u2DEF', '\u2DF0', '\u2DF1', '\u2DF2', '\u2DF3', '\u2DF4', '\u2DF5', '\u2DF6', '\u2DF7', '\u2DF8', '\u2DF9', '\u2DFA', '\u2DFB', '\u2DFC', '\u2DFD', '\u2DFE', '\u2DFF', '\u302A', '\u302B', '\u302C', '\u302D', '\u3099', '\u309A', '\uA66F', '\uA674', '\uA675', '\uA676', '\uA677', '\uA678', '\uA679', '\uA67A', '\uA67B', '\uA67C', '\uA67D', '\uA69F', '\uA6F0', '\uA6F1', '\uA802', '\uA806', '\uA80B', '\uA825', '\uA826', '\uA8C4', '\uA8E0', '\uA8E1', '\uA8E2', '\uA8E3', '\uA8E4', '\uA8E5', '\uA8E6', '\uA8E7', '\uA8E8', '\uA8E9', '\uA8EA', '\uA8EB', '\uA8EC', '\uA8ED', '\uA8EE', '\uA8EF', '\uA8F0', '\uA8F1', '\uA926', '\uA927', '\uA928', '\uA929', '\uA92A', '\uA92B', '\uA92C', '\uA92D', '\uA947', '\uA948', '\uA949', '\uA94A', '\uA94B', '\uA94C', '\uA94D', '\uA94E', '\uA94F', '\uA950', '\uA951', '\uA980', '\uA981', '\uA982', '\uA9B3', '\uA9B6', '\uA9B7', '\uA9B8', '\uA9B9', '\uA9BC', '\uAA29', '\uAA2A', '\uAA2B', '\uAA2C', '\uAA2D', '\uAA2E', '\uAA31', '\uAA32', '\uAA35', '\uAA36', '\uAA43', '\uAA4C', '\uAAB0', '\uAAB2', '\uAAB3', '\uAAB4', '\uAAB7', '\uAAB8', '\uAABE', '\uAABF', '\uAAC1', '\uAAEC', '\uAAED', '\uAAF6', '\uABE5', '\uABE8', '\uABED', '\uFB1E', '\uFE00', '\uFE01', '\uFE02', '\uFE03', '\uFE04', '\uFE05', '\uFE06', '\uFE07', '\uFE08', '\uFE09', '\uFE0A', '\uFE0B', '\uFE0C', '\uFE0D', '\uFE0E', '\uFE0F', '\uFE20', '\uFE21', '\uFE22', '\uFE23', '\uFE24', '\uFE25', '\uFE26', '\U000101FD', '\U00010A01', '\U00010A02', '\U00010A03', '\U00010A05', '\U00010A06', '\U00010A0C', '\U00010A0D', '\U00010A0E', '\U00010A0F', '\U00010A38', '\U00010A39', '\U00010A3A', '\U00010A3F', '\U00011001', '\U00011038', '\U00011039', '\U0001103A', '\U0001103B', '\U0001103C', '\U0001103D', '\U0001103E', '\U0001103F', '\U00011040', '\U00011041', '\U00011042', '\U00011043', '\U00011044', '\U00011045', '\U00011046', '\U00011080', '\U00011081', '\U000110B3', '\U000110B4', '\U000110B5', '\U000110B6', '\U000110B9', '\U000110BA', '\U00011100', '\U00011101', '\U00011102', '\U00011127', '\U00011128', '\U00011129', '\U0001112A', '\U0001112B', '\U0001112D', '\U0001112E', '\U0001112F', '\U00011130', '\U00011131', '\U00011132', '\U00011133', '\U00011134', '\U00011180', '\U00011181', '\U000111B6', '\U000111B7', '\U000111B8', '\U000111B9', '\U000111BA', '\U000111BB', '\U000111BC', '\U000111BD', '\U000111BE', '\U000116AB', '\U000116AD', '\U000116B0', '\U000116B1', '\U000116B2', '\U000116B3', '\U000116B4', '\U000116B5', '\U000116B7', '\U00016F8F', '\U00016F90', '\U00016F91', '\U00016F92', '\U0001D167', '\U0001D168', '\U0001D169', '\U0001D17B', '\U0001D17C', '\U0001D17D', '\U0001D17E', '\U0001D17F', '\U0001D180', '\U0001D181', '\U0001D182', '\U0001D185', 
'\U0001D186', '\U0001D187', '\U0001D188', '\U0001D189', '\U0001D18A', '\U0001D18B', '\U0001D1AA', '\U0001D1AB', '\U0001D1AC', '\U0001D1AD', '\U0001D242', '\U0001D243', '\U0001D244', '\U000E0100', '\U000E0101', '\U000E0102', '\U000E0103', '\U000E0104', '\U000E0105', '\U000E0106', '\U000E0107', '\U000E0108', '\U000E0109', '\U000E010A', '\U000E010B', '\U000E010C', '\U000E010D', '\U000E010E', '\U000E010F', '\U000E0110', '\U000E0111', '\U000E0112', '\U000E0113', '\U000E0114', '\U000E0115', '\U000E0116', '\U000E0117', '\U000E0118', '\U000E0119', '\U000E011A', '\U000E011B', '\U000E011C', '\U000E011D', '\U000E011E', '\U000E011F', '\U000E0120', '\U000E0121', '\U000E0122', '\U000E0123', '\U000E0124', '\U000E0125', '\U000E0126', '\U000E0127', '\U000E0128', '\U000E0129', '\U000E012A', '\U000E012B', '\uE012C', '\U000E012D', '\U000E012E', '\U000E012F', '\U000E0130', '\U000E0131', '\U000E0132', '\U000E0133', '\U000E0134', '\U000E0135', '\U000E0136', '\U000E0137', '\U000E0138', '\U000E0139', '\U000E013A', '\U000E013B', '\U000E013C', '\U000E013D', '\U000E013E', '\U000E013F', '\U000E0140', '\U000E0141', '\U000E0142', '\U000E0143', '\U000E0144', '\U000E0145', '\U000E0146', '\U000E0147', '\U000E0148', '\U000E0149', '\U000E014A', '\U000E014B', '\U000E014C', '\U000E014D', '\U000E014E', '\U000E014F', '\U000E0150', '\U000E0151', '\U000E0152', '\U000E0153', '\U000E0154', '\U000E0155', '\U000E0156', '\U000E0157', '\U000E0158', '\U000E0159', '\U000E015A', '\U000E015B', '\U000E015C', '\U000E015D', '\U000E015E', '\U000E015F', '\U000E0160', '\U000E0161', '\U000E0162', '\U000E0163', '\U000E0164', '\U000E0165', '\U000E0166', '\U000E0167', '\U000E0168', '\U000E0169', '\U000E016A', '\U000E016B', '\U000E016C', '\U000E016D', '\U000E016E', '\U000E016F', '\U000E0170', '\U000E0171', '\U000E0172', '\U000E0173', '\U000E0174', '\U000E0175', '\U000E0176', '\U000E0177', '\U000E0178', '\U000E0179', '\U000E017A', '\U000E017B', '\U000E017C', '\U000E017D', '\U000E017E', '\U000E017F', '\U000E0180', '\U000E0181', '\U000E0182', '\U000E0183', '\U000E0184', '\U000E0185', '\uE0186', '\U000E0187', '\U000E0188', '\U000E0189', '\U000E018A', '\U000E018B', '\U000E018C', '\U000E018D', '\U000E018E', '\U000E018F', '\U000E0190', '\U000E0191', '\U000E0192', '\U000E0193', '\U000E0194', '\U000E0195', '\U000E0196', '\U000E0197', '\U000E0198', '\U000E0199', '\U000E019A', '\U000E019B', '\U000E019C', '\U000E019D', '\U000E019E', '\U000E019F', '\U000E01A0', '\U000E01A1', '\U000E01A2', '\U000E01A3', '\U000E01A4', '\U000E01A5', '\U000E01A6', '\U000E01A7', '\U000E01A8', '\U000E01A9', '\U000E01AA', '\U000E01AB', '\U000E01AC', '\U000E01AD', '\U000E01AE', '\U000E01AF', '\U000E01B0', '\U000E01B1', '\U000E01B2', '\U000E01B3', '\U000E01B4', '\U000E01B5', '\U000E01B6', '\U000E01B7', '\U000E01B8', '\U000E01B9', '\U000E01BA', '\U000E01BB', '\U000E01BC', '\U000E01BD', '\U000E01BE', '\U000E01BF', '\U000E01C0', '\U000E01C1', '\U000E01C2', '\U000E01C3', '\U000E01C4', '\U000E01C5', '\U000E01C6', '\U000E01C7', '\U000E01C8', '\U000E01C9', '\U000E01CA', '\U000E01CB', '\U000E01CC', '\U000E01CD', '\U000E01CE', '\U000E01CF', '\U000E01D0', '\U000E01D1', '\U000E01D2', '\U000E01D3', '\U000E01D4', '\U000E01D5', '\U000E01D6', '\U000E01D7', '\U000E01D8', '\U000E01D9', '\U000E01DA', '\U000E01DB', '\U000E01DC', '\U000E01DD', '\U000E01DE', '\U000E01DF', '\U000E01E0', '\U000E01E1', '\U000E01E2', '\U000E01E3', '\U000E01E4', '\U000E01E5', '\U000E01E6', '\U000E01E7', '\U000E01E8', '\U000E01E9', '\U000E01EA', '\U000E01EB', '\U000E01EC', '\U000E01ED', '\U000E01EE', '\U000E01EF'];
NOTE: we have both \U000XXXXX and \uXXXX representations here.
I want to count the characters of Unicode input text like this Hindi string "अब यहां से कहा जाएँ हम", or just a token word like "समझा", excluding the non-spacing characters.
My implementation looks like this:
def countNonSpacingCharString(str):
    count = 0
    for char in str:
        if char not in UNICODE_NSM:
            count = count + 1
    return count
Thanks to the help provided in the answers below, I have put it all together in this GitHub repo. There is also a mark codepoints list ready to be used in JavaScript / Node.js - https://github.com/loretoparisi/unicode_marks
Fastest way I came up with. len was slightly faster than sum. I built a set of all combining mark types in the setup.
test.py:
import sys
from unicodedata import category

MARK_SET = set(chr(c) for c in range(sys.maxunicode + 1) if category(chr(c))[0] == 'M')
s = "अब यहां से कहा जाएँ हम"

def count_len(s):
    return len([c for c in s if c not in MARK_SET])

def count_sum(s):
    return sum([c not in MARK_SET for c in s])

if __name__ == '__main__':
    print(len(s))
    print(count_len(s))
    print(count_sum(s))
Output:
22
16
16
Timings:
C:\>py -m timeit -s "from test import count_sum,s" "count_sum(s)"
50000 loops, best of 5: 4.62 usec per loop
C:\>py -m timeit -s "from test import count_len,s" "count_len(s)"
50000 loops, best of 5: 3.97 usec per loop
It's worth noting that there is a third-party grapheme library. grapheme.length(s) == 16 as well, but it was much slower (118 µs). The full grapheme-detecting algorithm is more complicated than just skipping the mark category; consider the combining emoji for families and skin colors.
See also Unicode Text Segmentation.
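For completeness, a minimal usage sketch of the grapheme package mentioned above (it must be installed separately, e.g. pip install grapheme):
import grapheme

s = "अब यहां से कहा जाएँ हम"
print(grapheme.length(s))  # 16 - user-perceived characters (grapheme clusters)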
This might be a better alternative:
def countNonSpacingCharString(str):
    return len([char for char in str if char not in UNICODE_NSM])
How about using a dictionary to look up the values and, if a character is not present, incrementing the count? It should be faster than the former approach because the time complexity of checking for the presence of a character reduces to O(1).
The implementation should look somewhat like this:
Create a dict and populate it:
lookup_dict = {}
for alpha in UNICODE_NSM:
    lookup_dict[alpha] = 1
Look it up while looping through the string:
import time

def countNonSpacingCharString(str):
    count = 0
    for char in str:
        start_time = time.time()
        if not lookup_dict.get(char):
            count = count + 1
        print("--- %s seconds ---" % (time.time() - start_time))
    return count
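A note on this design choice: the dict values are never used, so a plain set gives the same O(1) membership test with less ceremony, e.g.:
NSM_SET = set(UNICODE_NSM)

def countNonSpacingCharString(s):
    # same O(1) lookup as the dict, expressed as plain membership
    return sum(1 for char in s if char not in NSM_SET)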
I must note that using str as a variable name in Python is a bad idea, as it is the name of a built-in type. Anyway, I would implement your function the following way:
def countNonSpacingCharString(s):
    return len(filter(lambda x: x not in UNICODE_NSM, s))
in Python 2, and
def countNonSpacingCharString(s):
    return sum(1 for _ in filter(lambda x: x not in UNICODE_NSM, s))
in Python 3.
Inspecting my function using dis.dis showed that it produced less bytecode than your version with count, suggesting it might be faster, though this needs further investigation.
EDIT: I tested my code in Python 2, but not Python 3 - the Python 3 version was added using Mohammad Banisaeid's answer from this topic.
EDIT 2: If you use UNICODE_NSM only for this, you might try a set instead of a list, which should speed up the in operator, though again this needs further investigation. For a discussion of list vs. set performance, see this thread.
Perhaps the easiest way to do this is to use the unicodedata module, in part because it will be more rigorously tested. Indeed, I found that your list appeared to include categories other than Mn. That is, it includes Unicode points from Mc (Mark, spacing combining) as well, but you said you only wanted to exclude Unicode points from Mn (Mark, nonspacing).
e.g.
import unicodedata

def countNonSpacingCharString(string):
    category = unicodedata.category
    return sum(category(char) != 'Mn' for char in string)
This appears to be about 60 times faster according to timeit.
You might get a TypeError if your version of Python, and therefore unicodedata, is not up to date and so is not aware of recent additions to Unicode. You can get around this by installing unicodedata2 and using that instead.
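A common pattern, assuming unicodedata2 is installed, is to fall back to the stdlib module, since unicodedata2 mirrors its API:
try:
    import unicodedata2 as unicodedata  # third-party, tracks newer Unicode versions
except ImportError:
    import unicodedata                  # stdlib fallback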
From your comments it looks like you're really after counting "user-perceived characters". This is a complicated process with a number of edge cases. If you can, then you should install regex in your environment (that would be micropython?). You can then do:
>>> parts = regex.findall(r'\X', 'अब यहां से कहा जाएँ हम')
>>> parts
['अ', 'ब', ' ', 'य', 'हां', ' ', 'से', ' ', 'क', 'हा', ' ', 'जा', 'एँ', ' ', 'ह', 'म']
>>> len(parts)
16
Which splits your string into "user perceived characters", and then you can work on this list of strings to get what you need.
Failing that, your current solution of just ignoring Mark code points is an 80/20 solution (it gets you most of the way there for the least amount of effort). You will have to revise your list of Unicode marks, though. My tests showed that your list was missing 113 code points across all the Indo-European and Dravidian scripts in Unicode (Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam, and Sinhala).
I extracted these characters by downloading and parsing: https://www.unicode.org/Public/11.0.0/ucd/UnicodeData.txt with the following code:
indian_script_range = range(0x0900, 0x0E00)  # doesn't include all indic scripts (eg. Thai)
basic_multilingual_plane = range(0x0000, 0x10000)
# use the latter if you want to be more thorough and include all indic scripts and non-indic scripts
codepoint_range = indian_script_range
codepoints = []
with open('UnicodeData.txt') as f:
    for line in f:
        hex_string, name, category, *rest = line.strip().split(';')
        codepoint_number = int(hex_string, base=16)
        if (
            category in ('Mn', 'Mc', 'Me')
            and (
                codepoint_number in codepoint_range
                or name.startswith('VARIATION SELECTOR')  # you seemed to want to include these
            )
        ):
            codepoints.append(chr(codepoint_number))
missing = set(codepoints) - set(UNICODE_NSM)
Mark Tolonen's answer is the fastest because it uses a set for the comparison. If you have a text of length n and a list of m mark characters to compare against, the worst-case runtime using lists is O(n·m); using a set for the mark characters reduces that to O(n).
Using unicodedata.category is just nicer because it is shorter and less prone to human error.
Performance comparison
You can clearly see that markset_count and category_count are way faster than generator_count and loop_count. Also, the speed of the latter two varies far more. Interestingly, generator_count is slower than loop_count.
markset_count is a bit faster than category_count. I think that is because looking up the category and doing the string comparison also takes a bit of time. The difference becomes much clearer when you plot only those two and increase the text length:
import timeit
import sys
import unicodedata
import numpy as np
UNICODE_NSM = ['\u0300', '\u0301', '\u0302', '\u0303', '\u0304', '\u0305', '\u0306', '\u0307', '\u0308', '\u0309', '\u030A', '\u030B', '\u030C', '\u030D', '\u030E', '\u030F', '\u0310', '\u0311', '\u0312', '\u0313', '\u0314', '\u0315', '\u0316', '\u0317', '\u0318', '\u0319', '\u031A', '\u031B', '\u031C', '\u031D', '\u031E', '\u031F', '\u0320', '\u0321', '\u0322', '\u0323', '\u0324', '\u0325', '\u0326', '\u0327', '\u0328', '\u0329', '\u032A', '\u032B', '\u032C', '\u032D', '\u032E', '\u032F', '\u0330', '\u0331', '\u0332', '\u0333', '\u0334', '\u0335', '\u0336', '\u0337', '\u0338', '\u0339', '\u033A', '\u033B', '\u033C', '\u033D', '\u033E', '\u033F', '\u0340', '\u0341', '\u0342', '\u0343', '\u0344', '\u0345', '\u0346', '\u0347', '\u0348', '\u0349', '\u034A', '\u034B', '\u034C', '\u034D', '\u034E', '\u034F', '\u0350', '\u0351', '\u0352', '\u0353', '\u0354', '\u0355', '\u0356', '\u0357', '\u0358', '\u0359', '\u035A', '\u035B', '\u035C', '\u035D', '\u035E', '\u035F', '\u0360', '\u0361', '\u0362', '\u0363', '\u0364', '\u0365', '\u0366', '\u0367', '\u0368', '\u0369', '\u036A', '\u036B', '\u036C', '\u036D', '\u036E', '\u036F', '\u0483', '\u0484', '\u0485', '\u0486', '\u0487', '\u0591', '\u0592', '\u0593', '\u0594', '\u0595', '\u0596', '\u0597', '\u0598', '\u0599', '\u059A', '\u059B', '\u059C', '\u059D', '\u059E', '\u059F', '\u05A0', '\u05A1', '\u05A2', '\u05A3', '\u05A4', '\u05A5', '\u05A6', '\u05A7', '\u05A8', '\u05A9', '\u05AA', '\u05AB', '\u05AC', '\u05AD', '\u05AE', '\u05AF', '\u05B0', '\u05B1', '\u05B2', '\u05B3', '\u05B4', '\u05B5', '\u05B6', '\u05B7', '\u05B8', '\u05B9', '\u05BA', '\u05BB', '\u05BC', '\u05BD', '\u05BF', '\u05C1', '\u05C2', '\u05C4', '\u05C5', '\u05C7', '\u0610', '\u0611', '\u0612', '\u0613', '\u0614', '\u0615', '\u0616', '\u0617', '\u0618', '\u0619', '\u061A', '\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650', '\u0651', '\u0652', '\u0653', '\u0654', '\u0655', '\u0656', '\u0657', '\u0658', '\u0659', '\u065A', '\u065B', '\u065C', '\u065D', '\u065E', '\u065F', '\u0670', '\u06D6', '\u06D7', '\u06D8', '\u06D9', '\u06DA', '\u06DB', '\u06DC', '\u06DF', '\u06E0', '\u06E1', '\u06E2', '\u06E3', '\u06E4', '\u06E7', '\u06E8', '\u06EA', '\u06EB', '\u06EC', '\u06ED', '\u0711', '\u0730', '\u0731', '\u0732', '\u0733', '\u0734', '\u0735', '\u0736', '\u0737', '\u0738', '\u0739', '\u073A', '\u073B', '\u073C', '\u073D', '\u073E', '\u073F', '\u0740', '\u0741', '\u0742', '\u0743', '\u0744', '\u0745', '\u0746', '\u0747', '\u0748', '\u0749', '\u074A', '\u07A6', '\u07A7', '\u07A8', '\u07A9', '\u07AA', '\u07AB', '\u07AC', '\u07AD', '\u07AE', '\u07AF', '\u07B0', '\u07EB', '\u07EC', '\u07ED', '\u07EE', '\u07EF', '\u07F0', '\u07F1', '\u07F2', '\u07F3', '\u0816', '\u0817', '\u0818', '\u0819', '\u081B', '\u081C', '\u081D', '\u081E', '\u081F', '\u0820', '\u0821', '\u0822', '\u0823', '\u0825', '\u0826', '\u0827', '\u0829', '\u082A', '\u082B', '\u082C', '\u082D', '\u0859', '\u085A', '\u085B', '\u08E4', '\u08E5', '\u08E6', '\u08E7', '\u08E8', '\u08E9', '\u08EA', '\u08EB', '\u08EC', '\u08ED', '\u08EE', '\u08EF', '\u08F0', '\u08F1', '\u08F2', '\u08F3', '\u08F4', '\u08F5', '\u08F6', '\u08F7', '\u08F8', '\u08F9', '\u08FA', '\u08FB', '\u08FC', '\u08FD', '\u08FE', '\u0900', '\u0901', '\u0902', '\u093A', '\u093C', '\u093E', '\u0941', '\u0942', '\u0943', '\u0944', '\u0945', '\u0946', '\u0947', '\u0948', '\u094D', '\u0951', '\u0952', '\u0953', '\u0954', '\u0955', '\u0956', '\u0957', '\u0962', '\u0963', '\u0981', '\u09BC', '\u09C1', '\u09C2', '\u09C3', '\u09C4', '\u09CD', '\u09E2', '\u09E3', '\u0A01', '\u0A02', 
'\u0A3C', '\u0A41', '\u0A42', '\u0A47', '\u0A48', '\u0A4B', '\u0A4C', '\u0A4D', '\u0A51', '\u0A70', '\u0A71', '\u0A75', '\u0A81', '\u0A82', '\u0ABC', '\u0AC1', '\u0AC2', '\u0AC3', '\u0AC4', '\u0AC5', '\u0AC7', '\u0AC8', '\u0ACD', '\u0AE2', '\u0AE3', '\u0B01', '\u0B3C', '\u0B3F', '\u0B41', '\u0B42', '\u0B43', '\u0B44', '\u0B4D', '\u0B56', '\u0B62', '\u0B63', '\u0B82', '\u0BC0', '\u0BCD', '\u0C3E', '\u0C3F', '\u0C40', '\u0C46', '\u0C47', '\u0C48', '\u0C4A', '\u0C4B', '\u0C4C', '\u0C4D', '\u0C55', '\u0C56', '\u0C62', '\u0C63', '\u0CBC', '\u0CBF', '\u0CC6', '\u0CCC', '\u0CCD', '\u0CE2', '\u0CE3', '\u0D41', '\u0D42', '\u0D43', '\u0D44', '\u0D4D', '\u0D62', '\u0D63', '\u0DCA', '\u0DD2', '\u0DD3', '\u0DD4', '\u0DD6', '\u0E31', '\u0E34', '\u0E35', '\u0E36', '\u0E37', '\u0E38', '\u0E39', '\u0E3A', '\u0E47', '\u0E48', '\u0E49', '\u0E4A', '\u0E4B', '\u0E4C', '\u0E4D', '\u0E4E', '\u0EB1', '\u0EB4', '\u0EB5', '\u0EB6', '\u0EB7', '\u0EB8', '\u0EB9', '\u0EBB', '\u0EBC', '\u0EC8', '\u0EC9', '\u0ECA', '\u0ECB', '\u0ECC', '\u0ECD', '\u0F18', '\u0F19', '\u0F35', '\u0F37', '\u0F39', '\u0F71', '\u0F72', '\u0F73', '\u0F74', '\u0F75', '\u0F76', '\u0F77', '\u0F78', '\u0F79', '\u0F7A', '\u0F7B', '\u0F7C', '\u0F7D', '\u0F7E', '\u0F80', '\u0F81', '\u0F82', '\u0F83', '\u0F84', '\u0F86', '\u0F87', '\u0F8D', '\u0F8E', '\u0F8F', '\u0F90', '\u0F91', '\u0F92', '\u0F93', '\u0F94', '\u0F95', '\u0F96', '\u0F97', '\u0F99', '\u0F9A', '\u0F9B', '\u0F9C', '\u0F9D', '\u0F9E', '\u0F9F', '\u0FA0', '\u0FA1', '\u0FA2', '\u0FA3', '\u0FA4', '\u0FA5', '\u0FA6', '\u0FA7', '\u0FA8', '\u0FA9', '\u0FAA', '\u0FAB', '\u0FAC', '\u0FAD', '\u0FAE', '\u0FAF', '\u0FB0', '\u0FB1', '\u0FB2', '\u0FB3', '\u0FB4', '\u0FB5', '\u0FB6', '\u0FB7', '\u0FB8', '\u0FB9', '\u0FBA', '\u0FBB', '\u0FBC', '\u0FC6', '\u102D', '\u102E', '\u102F', '\u1030', '\u1032', '\u1033', '\u1034', '\u1035', '\u1036', '\u1037', '\u1039', '\u103A', '\u103D', '\u103E', '\u1058', '\u1059', '\u105E', '\u105F', '\u1060', '\u1071', '\u1072', '\u1073', '\u1074', '\u1082', '\u1085', '\u1086', '\u108D', '\u109D', '\u135D', '\u135E', '\u135F', '\u1712', '\u1713', '\u1714', '\u1732', '\u1733', '\u1734', '\u1752', '\u1753', '\u1772', '\u1773', '\u17B4', '\u17B5', '\u17B7', '\u17B8', '\u17B9', '\u17BA', '\u17BB', '\u17BC', '\u17BD', '\u17C6', '\u17C9', '\u17CA', '\u17CB', '\u17CC', '\u17CD', '\u17CE', '\u17CF', '\u17D0', '\u17D1', '\u17D2', '\u17D3', '\u17DD', '\u180B', '\u180C', '\u180D', '\u18A9', '\u1920', '\u1921', '\u1922', '\u1927', '\u1928', '\u1932', '\u1939', '\u193A', '\u193B', '\u1A17', '\u1A18', '\u1A56', '\u1A58', '\u1A59', '\u1A5A', '\u1A5B', '\u1A5C', '\u1A5D', '\u1A5E', '\u1A60', '\u1A62', '\u1A65', '\u1A66', '\u1A67', '\u1A68', '\u1A69', '\u1A6A', '\u1A6B', '\u1A6C', '\u1A73', '\u1A74', '\u1A75', '\u1A76', '\u1A77', '\u1A78', '\u1A79', '\u1A7A', '\u1A7B', '\u1A7C', '\u1A7F', '\u1B00', '\u1B01', '\u1B02', '\u1B03', '\u1B34', '\u1B36', '\u1B37', '\u1B38', '\u1B39', '\u1B3A', '\u1B3C', '\u1B42', '\u1B6B', '\u1B6C', '\u1B6D', '\u1B6E', '\u1B6F', '\u1B70', '\u1B71', '\u1B72', '\u1B73', '\u1B80', '\u1B81', '\u1BA2', '\u1BA3', '\u1BA4', '\u1BA5', '\u1BA8', '\u1BA9', '\u1BAB', '\u1BE6', '\u1BE8', '\u1BE9', '\u1BED', '\u1BEF', '\u1BF0', '\u1BF1', '\u1C2C', '\u1C2D', '\u1C2E', '\u1C2F', '\u1C30', '\u1C31', '\u1C32', '\u1C33', '\u1C36', '\u1C37', '\u1CD0', '\u1CD1', '\u1CD2', '\u1CD4', '\u1CD5', '\u1CD6', '\u1CD7', '\u1CD8', '\u1CD9', '\u1CDA', '\u1CDB', '\u1CDC', '\u1CDD', '\u1CDE', '\u1CDF', '\u1CE0', '\u1CE2', '\u1CE3', '\u1CE4', '\u1CE5', '\u1CE6', '\u1CE7', '\u1CE8', '\u1CED', 
'\u1CF4', '\u1DC0', '\u1DC1', '\u1DC2', '\u1DC3', '\u1DC4', '\u1DC5', '\u1DC6', '\u1DC7', '\u1DC8', '\u1DC9', '\u1DCA', '\u1DCB', '\u1DCC', '\u1DCD', '\u1DCE', '\u1DCF', '\u1DD0', '\u1DD1', '\u1DD2', '\u1DD3', '\u1DD4', '\u1DD5', '\u1DD6', '\u1DD7', '\u1DD8', '\u1DD9', '\u1DDA', '\u1DDB', '\u1DDC', '\u1DDD', '\u1DDE', '\u1DDF', '\u1DE0', '\u1DE1', '\u1DE2', '\u1DE3', '\u1DE4', '\u1DE5', '\u1DE6', '\u1DFC', '\u1DFD', '\u1DFE', '\u1DFF', '\u20D0', '\u20D1', '\u20D2', '\u20D3', '\u20D4', '\u20D5', '\u20D6', '\u20D7', '\u20D8', '\u20D9', '\u20DA', '\u20DB', '\u20DC', '\u20E1', '\u20E5', '\u20E6', '\u20E7', '\u20E8', '\u20E9', '\u20EA', '\u20EB', '\u20EC', '\u20ED', '\u20EE', '\u20EF', '\u20F0', '\u2CEF', '\u2CF0', '\u2CF1', '\u2D7F', '\u2DE0', '\u2DE1', '\u2DE2', '\u2DE3', '\u2DE4', '\u2DE5', '\u2DE6', '\u2DE7', '\u2DE8', '\u2DE9', '\u2DEA', '\u2DEB', '\u2DEC', '\u2DED', '\u2DEE', '\u2DEF', '\u2DF0', '\u2DF1', '\u2DF2', '\u2DF3', '\u2DF4', '\u2DF5', '\u2DF6', '\u2DF7', '\u2DF8', '\u2DF9', '\u2DFA', '\u2DFB', '\u2DFC', '\u2DFD', '\u2DFE', '\u2DFF', '\u302A', '\u302B', '\u302C', '\u302D', '\u3099', '\u309A', '\uA66F', '\uA674', '\uA675', '\uA676', '\uA677', '\uA678', '\uA679', '\uA67A', '\uA67B', '\uA67C', '\uA67D', '\uA69F', '\uA6F0', '\uA6F1', '\uA802', '\uA806', '\uA80B', '\uA825', '\uA826', '\uA8C4', '\uA8E0', '\uA8E1', '\uA8E2', '\uA8E3', '\uA8E4', '\uA8E5', '\uA8E6', '\uA8E7', '\uA8E8', '\uA8E9', '\uA8EA', '\uA8EB', '\uA8EC', '\uA8ED', '\uA8EE', '\uA8EF', '\uA8F0', '\uA8F1', '\uA926', '\uA927', '\uA928', '\uA929', '\uA92A', '\uA92B', '\uA92C', '\uA92D', '\uA947', '\uA948', '\uA949', '\uA94A', '\uA94B', '\uA94C', '\uA94D', '\uA94E', '\uA94F', '\uA950', '\uA951', '\uA980', '\uA981', '\uA982', '\uA9B3', '\uA9B6', '\uA9B7', '\uA9B8', '\uA9B9', '\uA9BC', '\uAA29', '\uAA2A', '\uAA2B', '\uAA2C', '\uAA2D', '\uAA2E', '\uAA31', '\uAA32', '\uAA35', '\uAA36', '\uAA43', '\uAA4C', '\uAAB0', '\uAAB2', '\uAAB3', '\uAAB4', '\uAAB7', '\uAAB8', '\uAABE', '\uAABF', '\uAAC1', '\uAAEC', '\uAAED', '\uAAF6', '\uABE5', '\uABE8', '\uABED', '\uFB1E', '\uFE00', '\uFE01', '\uFE02', '\uFE03', '\uFE04', '\uFE05', '\uFE06', '\uFE07', '\uFE08', '\uFE09', '\uFE0A', '\uFE0B', '\uFE0C', '\uFE0D', '\uFE0E', '\uFE0F', '\uFE20', '\uFE21', '\uFE22', '\uFE23', '\uFE24', '\uFE25', '\uFE26', '\U000101FD', '\U00010A01', '\U00010A02', '\U00010A03', '\U00010A05', '\U00010A06', '\U00010A0C', '\U00010A0D', '\U00010A0E', '\U00010A0F', '\U00010A38', '\U00010A39', '\U00010A3A', '\U00010A3F', '\U00011001', '\U00011038', '\U00011039', '\U0001103A', '\U0001103B', '\U0001103C', '\U0001103D', '\U0001103E', '\U0001103F', '\U00011040', '\U00011041', '\U00011042', '\U00011043', '\U00011044', '\U00011045', '\U00011046', '\U00011080', '\U00011081', '\U000110B3', '\U000110B4', '\U000110B5', '\U000110B6', '\U000110B9', '\U000110BA', '\U00011100', '\U00011101', '\U00011102', '\U00011127', '\U00011128', '\U00011129', '\U0001112A', '\U0001112B', '\U0001112D', '\U0001112E', '\U0001112F', '\U00011130', '\U00011131', '\U00011132', '\U00011133', '\U00011134', '\U00011180', '\U00011181', '\U000111B6', '\U000111B7', '\U000111B8', '\U000111B9', '\U000111BA', '\U000111BB', '\U000111BC', '\U000111BD', '\U000111BE', '\U000116AB', '\U000116AD', '\U000116B0', '\U000116B1', '\U000116B2', '\U000116B3', '\U000116B4', '\U000116B5', '\U000116B7', '\U00016F8F', '\U00016F90', '\U00016F91', '\U00016F92', '\U0001D167', '\U0001D168', '\U0001D169', '\U0001D17B', '\U0001D17C', '\U0001D17D', '\U0001D17E', '\U0001D17F', '\U0001D180', '\U0001D181', '\U0001D182', '\U0001D185', 
'\U0001D186', '\U0001D187', '\U0001D188', '\U0001D189', '\U0001D18A', '\U0001D18B', '\U0001D1AA', '\U0001D1AB', '\U0001D1AC', '\U0001D1AD', '\U0001D242', '\U0001D243', '\U0001D244', '\U000E0100', '\U000E0101', '\U000E0102', '\U000E0103', '\U000E0104', '\U000E0105', '\U000E0106', '\U000E0107', '\U000E0108', '\U000E0109', '\U000E010A', '\U000E010B', '\U000E010C', '\U000E010D', '\U000E010E', '\U000E010F', '\U000E0110', '\U000E0111', '\U000E0112', '\U000E0113', '\U000E0114', '\U000E0115', '\U000E0116', '\U000E0117', '\U000E0118', '\U000E0119', '\U000E011A', '\U000E011B', '\U000E011C', '\U000E011D', '\U000E011E', '\U000E011F', '\U000E0120', '\U000E0121', '\U000E0122', '\U000E0123', '\U000E0124', '\U000E0125', '\U000E0126', '\U000E0127', '\U000E0128', '\U000E0129', '\U000E012A', '\U000E012B', '\uE012C', '\U000E012D', '\U000E012E', '\U000E012F', '\U000E0130', '\U000E0131', '\U000E0132', '\U000E0133', '\U000E0134', '\U000E0135', '\U000E0136', '\U000E0137', '\U000E0138', '\U000E0139', '\U000E013A', '\U000E013B', '\U000E013C', '\U000E013D', '\U000E013E', '\U000E013F', '\U000E0140', '\U000E0141', '\U000E0142', '\U000E0143', '\U000E0144', '\U000E0145', '\U000E0146', '\U000E0147', '\U000E0148', '\U000E0149', '\U000E014A', '\U000E014B', '\U000E014C', '\U000E014D', '\U000E014E', '\U000E014F', '\U000E0150', '\U000E0151', '\U000E0152', '\U000E0153', '\U000E0154', '\U000E0155', '\U000E0156', '\U000E0157', '\U000E0158', '\U000E0159', '\U000E015A', '\U000E015B', '\U000E015C', '\U000E015D', '\U000E015E', '\U000E015F', '\U000E0160', '\U000E0161', '\U000E0162', '\U000E0163', '\U000E0164', '\U000E0165', '\U000E0166', '\U000E0167', '\U000E0168', '\U000E0169', '\U000E016A', '\U000E016B', '\U000E016C', '\U000E016D', '\U000E016E', '\U000E016F', '\U000E0170', '\U000E0171', '\U000E0172', '\U000E0173', '\U000E0174', '\U000E0175', '\U000E0176', '\U000E0177', '\U000E0178', '\U000E0179', '\U000E017A', '\U000E017B', '\U000E017C', '\U000E017D', '\U000E017E', '\U000E017F', '\U000E0180', '\U000E0181', '\U000E0182', '\U000E0183', '\U000E0184', '\U000E0185', '\uE0186', '\U000E0187', '\U000E0188', '\U000E0189', '\U000E018A', '\U000E018B', '\U000E018C', '\U000E018D', '\U000E018E', '\U000E018F', '\U000E0190', '\U000E0191', '\U000E0192', '\U000E0193', '\U000E0194', '\U000E0195', '\U000E0196', '\U000E0197', '\U000E0198', '\U000E0199', '\U000E019A', '\U000E019B', '\U000E019C', '\U000E019D', '\U000E019E', '\U000E019F', '\U000E01A0', '\U000E01A1', '\U000E01A2', '\U000E01A3', '\U000E01A4', '\U000E01A5', '\U000E01A6', '\U000E01A7', '\U000E01A8', '\U000E01A9', '\U000E01AA', '\U000E01AB', '\U000E01AC', '\U000E01AD', '\U000E01AE', '\U000E01AF', '\U000E01B0', '\U000E01B1', '\U000E01B2', '\U000E01B3', '\U000E01B4', '\U000E01B5', '\U000E01B6', '\U000E01B7', '\U000E01B8', '\U000E01B9', '\U000E01BA', '\U000E01BB', '\U000E01BC', '\U000E01BD', '\U000E01BE', '\U000E01BF', '\U000E01C0', '\U000E01C1', '\U000E01C2', '\U000E01C3', '\U000E01C4', '\U000E01C5', '\U000E01C6', '\U000E01C7', '\U000E01C8', '\U000E01C9', '\U000E01CA', '\U000E01CB', '\U000E01CC', '\U000E01CD', '\U000E01CE', '\U000E01CF', '\U000E01D0', '\U000E01D1', '\U000E01D2', '\U000E01D3', '\U000E01D4', '\U000E01D5', '\U000E01D6', '\U000E01D7', '\U000E01D8', '\U000E01D9', '\U000E01DA', '\U000E01DB', '\U000E01DC', '\U000E01DD', '\U000E01DE', '\U000E01DF', '\U000E01E0', '\U000E01E1', '\U000E01E2', '\U000E01E3', '\U000E01E4', '\U000E01E5', '\U000E01E6', '\U000E01E7', '\U000E01E8', '\U000E01E9', '\U000E01EA', '\U000E01EB', '\U000E01EC', '\U000E01ED', '\U000E01EE', '\U000E01EF']
MARK_SET = set(chr(c) for c in range(sys.maxunicode + 1) if unicodedata.category(chr(c))[0] == 'M')
print('len(UNICODE_NSM) = {}'.format(len(UNICODE_NSM)))
print('len(MARK_SET) = {}'.format(len(MARK_SET)))
filepath = "UnicodeData.txt"
with open(filepath) as f:
    text = f.read()
text = text[:1000]
def main():
    ground_truth = loop_count(text)
    functions = [(loop_count, 'loop_count'),
                 (generator_count, 'generator_count'),
                 (category_count, 'category_count'),
                 (markset_count, 'markset_count'),
                 ]
    functions = functions[::-1]
    duration_list = {}
    for func, name in functions:
        is_correct = func(text) == ground_truth
        durations = timeit.repeat(lambda: func(text), repeat=500, number=3)
        if is_correct:
            correctness = 'correct'
        else:
            correctness = 'NOT correct'
        duration_list[name] = durations
        print('{func:<20}: {correctness}, '
              'min: {min:0.3f}s, mean: {mean:0.3f}s, max: {max:0.3f}s'
              .format(func=name,
                      correctness=correctness,
                      min=min(durations),
                      mean=np.mean(durations),
                      max=max(durations),
                      ))
    create_boxplot(duration_list)

def create_boxplot(duration_list):
    import seaborn as sns
    import matplotlib.pyplot as plt
    import operator
    plt.figure(num=None, figsize=(8, 4), dpi=300,
               facecolor='w', edgecolor='k')
    sns.set(style="whitegrid")
    sorted_keys, sorted_vals = zip(*sorted(duration_list.items(),
                                           key=operator.itemgetter(1)))
    flierprops = dict(markerfacecolor='0.75', markersize=1,
                      linestyle='none')
    ax = sns.boxplot(data=sorted_vals, width=.3, orient='h',
                     flierprops=flierprops)
    ax.set(xlabel="Time in s", ylabel="")
    plt.yticks(plt.yticks()[0], sorted_keys)
    plt.tight_layout()
    plt.savefig("output.png")

def generator_count(text):
    return sum(1 for char in text if char not in UNICODE_NSM)

def loop_count(text):
    # 1769137
    count = 0
    for char in text:
        if char not in UNICODE_NSM:
            count += 1
    return count

def markset_count(text):
    return sum(char not in MARK_SET for char in text)

def category_count(text):
    return sum(unicodedata.category(char) != 'Mn' for char in text)

if __name__ == '__main__':
    main()

How to load a CSV with nested arrays

I came across a dataset of Twitter users (Kaggle Source), but I found that it has a rather strange format. It contains a row with column headers, and then rows of what are essentially JSON arrays. The dataset is also quite large, which makes it very difficult to convert the entire file into JSON objects.
What is a good way to load this data into Python, preferably a Pandas Dataframe?
Example of Data
id,screenName,tags,avatar,followersCount,friendsCount,lang,lastSeen,tweetId,friends
"1969527638","LlngoMakeEmCum_",[ "#nationaldogday" ],"http://pbs.twimg.com/profile_images/534286217882652672/FNmiQYVO_normal.jpeg",319,112,"en",1472271687519,"769310701580083200",[ "1969574754", "1969295556", "1969284056", "1969612214", "1970067476", "1969797386", "1969430539", "1969840064", "1969698176", "1970005154", "283011644", "1969901029", "1969563175", "1969302314", "1969978662", "1969457936", "1969667533", "1969547821", "1969943478", "1969668032", "283006529", "1969809440", "1969601096", "1969298856", "1969331652", "1969385498", "1969674368", "1969565263", "1970144676", "1969745390", "1969947438", "1969734134", "1969801326", "1969324008", "1969259820", "1969535827", "1970072989", "1969771688", "1969437804", "1969507394", "1969509972", "1969751588", "283012808", "1969302888", "1970224440", "1969603532", "283011244", "1969501046", "1969887518", "1970153138", "1970267527", "1969941955", "1969421654", "1970013110", "1969544905", "1969839590", "1969876500", "1969674625", "1969337952", "1970046536", "1970090934", "1969419133", "1969517215", "1969787869", "1969298065", "1970149771", "1969422638", "1969504268", "1970025554", "1969776001", "1970138611", "1969316186", "1969547558", "1969689272", "283009727", "283015491", "1969526874", "1969662210", "1969536164", "1969320008", "1969893793", "1970158393", "1969365936", "1970194418", "1969942094", "1969631580", "1969704756", "1969920092", "1969712882", "1969791680", "1969408164", "1969754851", "1970205480", "1969840267", "1969443211", "1969706762", "1969692698", "1969751576", "1969486796", "1969286630", "1969686674", "1969833492", "1969294814", "1969472719", "1969685018", "283008559", "283011243", "1969680078", "1969545697", "1969646412", "1969442725", "1969692529" ]
"51878493","_notmichelle",[ "#nationaldogday" ],"http://pbs.twimg.com/profile_images/761977602173046786/4_utEHsD_normal.jpg",275,115,"en",1472270622663,"769309490038439936",[ "60789485", "2420931980", "2899776756", "127410795", "38747286", "1345516880", "236076395", "1242946609", "2567887488", "280777286", "2912446303", "1149916171", "3192577639", "239569380", "229974168", "389097282", "266336410", "1850301204", "2364414805", "812302213", "2318240348", "158634793", "542282350", "569664772", "766573472", "703551325", "168564432", "261054460", "402980453", "562547390", "539630318", "165167145", "22216387", "427568285", "61033129", "213519434", "373092437", "170762012", "273601960", "322108757", "1681816280", "357843027", "737471496", "406541143", "1084122632", "633477616", "537821327", "793079732", "2386380799", "479015607", "783354019", "365171478", "625002575", "2326207404", "1653286842", "1676964216", "2296617326", "1583692190", "1315393903", "377660026", "2235123476", "792779641", "351222527", "444993309", "588396446", "377629159", "469383424", "1726612471", "415230430", "942443390", "360924168", "318593248", "565022085", "319679735", "632508305", "377638254", "1392782078", "584483723", "377703135", "180463340", "564978577", "502517645", "1056960042", "285097108", "410245879", "159121042", "570399371", "502348447", "960927356", "377196638", "478142245", "335043809", "73546116", "11348282", "901302409", "53255593", "515983155", "391774800", "62351523", "724792351", "346296289", "152520627", "559053427", "508019115", "349996133", "378859519", "65120103", "190070557", "339868374", "417355200", "256729771", "16171898", "45266183", "16143507", "165258639" ]
We could start with something like this:
(We might need to rethink the use of | though, in case it can appear in the data; we could go for something more exotic like ╡.)
import pandas as pd
import io
import json
data = '''\
id,screenName,tags,avatar,followersCount,friendsCount,lang,lastSeen,tweetId,friends
"1969527638","LlngoMakeEmCum_",[ "#nationaldogday" ],"http://pbs.twimg.com/profile_images/534286217882652672/FNmiQYVO_normal.jpeg",319,112,"en",1472271687519,"769310701580083200",[ "1969574754", "1969295556", "1969284056", "1969612214", "1970067476", "1969797386", "1969430539", "1969840064", "1969698176", "1970005154", "283011644", "1969901029", "1969563175", "1969302314", "1969978662", "1969457936", "1969667533", "1969547821", "1969943478", "1969668032", "283006529", "1969809440", "1969601096", "1969298856", "1969331652", "1969385498", "1969674368", "1969565263", "1970144676", "1969745390", "1969947438", "1969734134", "1969801326", "1969324008", "1969259820", "1969535827", "1970072989", "1969771688", "1969437804", "1969507394", "1969509972", "1969751588", "283012808", "1969302888", "1970224440", "1969603532", "283011244", "1969501046", "1969887518", "1970153138", "1970267527", "1969941955", "1969421654", "1970013110", "1969544905", "1969839590", "1969876500", "1969674625", "1969337952", "1970046536", "1970090934", "1969419133", "1969517215", "1969787869", "1969298065", "1970149771", "1969422638", "1969504268", "1970025554", "1969776001", "1970138611", "1969316186", "1969547558", "1969689272", "283009727", "283015491", "1969526874", "1969662210", "1969536164", "1969320008", "1969893793", "1970158393", "1969365936", "1970194418", "1969942094", "1969631580", "1969704756", "1969920092", "1969712882", "1969791680", "1969408164", "1969754851", "1970205480", "1969840267", "1969443211", "1969706762", "1969692698", "1969751576", "1969486796", "1969286630", "1969686674", "1969833492", "1969294814", "1969472719", "1969685018", "283008559", "283011243", "1969680078", "1969545697", "1969646412", "1969442725", "1969692529" ]
"51878493","_notmichelle",[ "#nationaldogday" ],"http://pbs.twimg.com/profile_images/761977602173046786/4_utEHsD_normal.jpg",275,115,"en",1472270622663,"769309490038439936",[ "60789485", "2420931980", "2899776756", "127410795", "38747286", "1345516880", "236076395", "1242946609", "2567887488", "280777286", "2912446303", "1149916171", "3192577639", "239569380", "229974168", "389097282", "266336410", "1850301204", "2364414805", "812302213", "2318240348", "158634793", "542282350", "569664772", "766573472", "703551325", "168564432", "261054460", "402980453", "562547390", "539630318", "165167145", "22216387", "427568285", "61033129", "213519434", "373092437", "170762012", "273601960", "322108757", "1681816280", "357843027", "737471496", "406541143", "1084122632", "633477616", "537821327", "793079732", "2386380799", "479015607", "783354019", "365171478", "625002575", "2326207404", "1653286842", "1676964216", "2296617326", "1583692190", "1315393903", "377660026", "2235123476", "792779641", "351222527", "444993309", "588396446", "377629159", "469383424", "1726612471", "415230430", "942443390", "360924168", "318593248", "565022085", "319679735", "632508305", "377638254", "1392782078", "584483723", "377703135", "180463340", "564978577", "502517645", "1056960042", "285097108", "410245879", "159121042", "570399371", "502348447", "960927356", "377196638", "478142245", "335043809", "73546116", "11348282", "901302409", "53255593", "515983155", "391774800", "62351523", "724792351", "346296289", "152520627", "559053427", "508019115", "349996133", "378859519", "65120103", "190070557", "339868374", "417355200", "256729771", "16171898", "45266183", "16143507", "165258639" ]'''
# Replace the first 9 commas with '|' so the commas inside the
# friends column survive and '|' can serve as the field separator
data = '\n'.join(['|'.join(row.split(',', 9)) for row in data.split('\n')])
# REPLACE WITH THIS FOR REAL FILE
# with open('path/to/file') as f:
#     data = '\n'.join(['|'.join(row.split(',', 9)) for row in f.read().split('\n')])
# Read the dataframe; pandas strips the surrounding double quotes itself
df = pd.read_csv(io.StringIO(data), sep='|')
# The tags and friends columns are JSON-formatted strings; decode them
df['friends'] = df['friends'].apply(json.loads)
df['tags'] = df['tags'].apply(json.loads)
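If the parse succeeded, tags and friends are now real Python lists rather than strings, so a quick sanity check against the two sample rows above might look like this (expected values taken from the first sample row):
print(df.loc[0, 'screenName'])    # LlngoMakeEmCum_
print(df.loc[0, 'tags'])          # ['#nationaldogday']
print(len(df.loc[0, 'friends']))  # 112, matching the friendsCount column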
Safer approach:
import pandas as pd
import json

with open('path/to/file') as f:
    # splitlines() avoids a trailing empty row when the file ends with a newline
    columns, *rows = [row.split(',', 9) for row in f.read().splitlines()]

df = pd.DataFrame(rows, columns=columns)
# Convert the JSON-formatted strings to Python objects:
df['friends'] = df['friends'].apply(json.loads)
df['tags'] = df['tags'].apply(json.loads)
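One wrinkle with the split-based version: since no CSV parser ever runs, the quoted scalar fields keep their literal double quotes and every column comes out as a string. A short clean-up sketch, assuming the column layout shown above:
for col in ('id', 'screenName', 'avatar', 'lang', 'tweetId'):
    df[col] = df[col].str.strip('"')
for col in ('followersCount', 'friendsCount', 'lastSeen'):
    df[col] = df[col].astype('int64')
The upside of this approach over the separator trick is that there is no invented delimiter to collide with the data; the downside is that you have to do the unquoting and type conversion yourself.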

Python Exec Returns More Lines Than Expected

First off I would like to preface this question with, "yes, I know I shouldn't use exec, however my data is trusted."
I have a script that creates a bunch of defaultdict lines in a file. Each line contains key/value pairs of time and data. I go through a process to get these into a graph; however, the last line seems to be processed twice, so my graph has issues.
code portion in question:
import fileinput
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pylab import *

filein = 'list.txt'
output_name = "image2.png"
dicts = []
times = []
values = []
temp_csv_open = open(temp_csv, "a")
line = ""
for line in fileinput.input([filein]):
    line = line.replace("defaultdict(<type 'int'>,", "data = ")
    line = line.replace(")", "")
    line = line.strip()
    print line  # show and tell 1
    exec(line)
    print data  # show and tell 2
    for k in sorted(data.iterkeys()):
        print k, data[k]  # show and tell 3
        time = datetime.datetime.strptime(k, "%Y%m%d%H")
        times.append(time)
        values.append(data[k])
print times, values  # show and tell 4
my data:
defaultdict(<type 'int'>, {'2012021310': 76422, '2012021311': 94188, '2012021323': 139363, '2012021312': 111817, '2012021307': 71316, '2012021306': 82418, '2012021305': 65217, '2012021313': 127002, '2012021314': 141099, '2012021315': 147830, '2012021316': 136330, '2012021317': 122252, '2012021318': 118619, '2012021319': 115763, '2012021322': 137658, '2012021321': 130022, '2012021320': 121393, '2012021309': 69406, '2012021308': 66833})
defaultdict(<type 'int'>, {'2012021408': 139745, '2012021409': 143658, '2012021414': 288286, '2012021418': 31216, '2012021416': 268214, '2012021400': 207878, '2012021401': 269296, '2012021402': 270258, '2012021403': 275882, '2012021404': 232521, '2012021405': 195062, '2012021406': 166669, '2012021407': 142855, '2012021417': 245582, '2012021411': 194360, '2012021413': 262078, '2012021410': 158954, '2012021415': 296457, '2012021412': 237083})
defaultdict(<type 'int'>, {'2012021523': 676350, '2012021522': 670147, '2012021521': 650984, '2012021520': 617401, '2012021501': 170448, '2012021503': 246600, '2012021502': 250013, '2012021505': 363866, '2012021504': 300809, '2012021507': 333080, '2012021506': 370454, '2012021509': 343671, '2012021508': 330452, '2012021512': 549736, '2012021513': 622690, '2012021510': 387871, '2012021511': 456171, '2012021516': 647559, '2012021517': 600969, '2012021514': 692257, '2012021515': 706377, '2012021518': 579669, '2012021519': 587969})
defaultdict(<type 'int'>, {'2012021608': 333986, '2012021609': 344126, '2012021602': 651692, '2012021603': 676458, '2012021600': 664484, '2012021601': 686408, '2012021620': 932692, '2012021621': 1065501, '2012021604': 589033, '2012021605': 465191, '2012021623': 1316907, '2012021606': 389669, '2012021607': 342613, '2012021619': 828190, '2012021618': 617836, '2012021622': 1111334, '2012021611': 467532, '2012021610': 387220, '2012021613': 634585, '2012021612': 560227, '2012021615': 718498, '2012021614': 704008, '2012021617': 606396, '2012021616': 665030})
In the above code I've added 4 print statements to help me make my point:
show and tell 1 returns:
data = {'2012021310': 76422, '2012021311': 94188, '2012021323': 139363, '2012021312': 111817, '2012021307': 71316, '2012021306': 82418, '2012021305': 65217, '2012021313': 127002, '2012021314': 141099, '2012021315': 147830, '2012021316': 136330, '2012021317': 122252, '2012021318': 118619, '2012021319': 115763, '2012021322': 137658, '2012021321': 130022, '2012021320': 121393, '2012021309': 69406, '2012021308': 66833}
data = {'2012021408': 139745, '2012021409': 143658, '2012021414': 288286, '2012021418': 31216, '2012021416': 268214, '2012021400': 207878, '2012021401': 269296, '2012021402': 270258, '2012021403': 275882, '2012021404': 232521, '2012021405': 195062, '2012021406': 166669, '2012021407': 142855, '2012021417': 245582, '2012021411': 194360, '2012021413': 262078, '2012021410': 158954, '2012021415': 296457, '2012021412': 237083}
data = {'2012021523': 676350, '2012021522': 670147, '2012021521': 650984, '2012021520': 617401, '2012021501': 170448, '2012021503': 246600, '2012021502': 250013, '2012021505': 363866, '2012021504': 300809, '2012021507': 333080, '2012021506': 370454, '2012021509': 343671, '2012021508': 330452, '2012021512': 549736, '2012021513': 622690, '2012021510': 387871, '2012021511': 456171, '2012021516': 647559, '2012021517': 600969, '2012021514': 692257, '2012021515': 706377, '2012021518': 579669, '2012021519': 587969}
data = {'2012021608': 333986, '2012021609': 344126, '2012021602': 651692, '2012021603': 676458, '2012021600': 664484, '2012021601': 686408, '2012021620': 932692, '2012021621': 1065501, '2012021604': 589033, '2012021605': 465191, '2012021623': 1316907, '2012021606': 389669, '2012021607': 342613, '2012021619': 828190, '2012021618': 617836, '2012021622': 1111334, '2012021611': 467532, '2012021610': 387220, '2012021613': 634585, '2012021612': 560227, '2012021615': 718498, '2012021614': 704008, '2012021617': 606396, '2012021616': 665030}
However, here's where the issue is. Show and tell 2 returns:
{'2012021307': 71316, '2012021306': 82418, '2012021305': 65217, '2012021309': 69406, '2012021310': 76422, '2012021311': 94188, '2012021312': 111817, '2012021313': 127002, '2012021314': 141099, '2012021315': 147830, '2012021316': 136330, '2012021317': 122252, '2012021318': 118619, '2012021319': 115763, '2012021308': 66833, '2012021321': 130022, '2012021320': 121393, '2012021323': 139363, '2012021322': 137658}
{'2012021408': 139745, '2012021409': 143658, '2012021403': 275882, '2012021418': 31216, '2012021400': 207878, '2012021416': 268214, '2012021402': 270258, '2012021414': 288286, '2012021404': 232521, '2012021405': 195062, '2012021406': 166669, '2012021407': 142855, '2012021417': 245582, '2012021411': 194360, '2012021401': 269296, '2012021413': 262078, '2012021410': 158954, '2012021415': 296457, '2012021412': 237083}
{'2012021523': 676350, '2012021522': 670147, '2012021521': 650984, '2012021520': 617401, '2012021501': 170448, '2012021503': 246600, '2012021502': 250013, '2012021505': 363866, '2012021504': 300809, '2012021507': 333080, '2012021506': 370454, '2012021509': 343671, '2012021508': 330452, '2012021512': 549736, '2012021513': 622690, '2012021510': 387871, '2012021511': 456171, '2012021516': 647559, '2012021517': 600969, '2012021514': 692257, '2012021515': 706377, '2012021518': 579669, '2012021519': 587969}
{'2012021605': 465191, '2012021608': 333986, '2012021609': 344126, '2012021602': 651692, '2012021603': 676458, '2012021600': 664484, '2012021601': 686408, '2012021606': 389669, '2012021607': 342613, '2012021622': 1111334, '2012021623': 1316907, '2012021620': 932692, '2012021621': 1065501, '2012021619': 828190, '2012021618': 617836, '2012021604': 589033, '2012021611': 467532, '2012021610': 387220, '2012021613': 634585, '2012021612': 560227, '2012021615': 718498, '2012021614': 704008, '2012021617': 606396, '2012021616': 665030}
{'2012021605': 465191, '2012021608': 333986, '2012021609': 344126, '2012021602': 651692, '2012021603': 676458, '2012021600': 664484, '2012021601': 686408, '2012021606': 389669, '2012021607': 342613, '2012021622': 1111334, '2012021623': 1316907, '2012021620': 932692, '2012021621': 1065501, '2012021619': 828190, '2012021618': 617836, '2012021604': 589033, '2012021611': 467532, '2012021610': 387220, '2012021613': 634585, '2012021612': 560227, '2012021615': 718498, '2012021614': 704008, '2012021617': 606396, '2012021616': 665030}
So the obvious question is: where is that last entry starting with 2012021605 coming from? Is there something wrong in my use of exec?
The last line of your file is an empty line. After the replace and strip calls it becomes an empty string, so exec(line) executes nothing: the name data is never rebound and remains bound to the previous value, which is why the last dictionary is printed twice.
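A minimal fix is to skip lines that end up empty before calling exec (a sketch of the loop body, keeping the rest of your script as-is):
for line in fileinput.input([filein]):
    line = line.replace("defaultdict(<type 'int'>,", "data = ")
    line = line.replace(")", "")
    line = line.strip()
    if not line:
        continue  # a blank line would leave `data` bound to the previous dict
    exec(line)
Since each payload is just a dict literal, you could also drop exec entirely and parse the text between the prefix and the closing parenthesis with ast.literal_eval, which only evaluates literals:
import ast
prefix = "defaultdict(<type 'int'>,"
for line in fileinput.input([filein]):
    line = line.strip()
    if not line.startswith(prefix):
        continue  # skips blank or malformed lines as a side effect
    data = ast.literal_eval(line[len(prefix):].rstrip(')'))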
