Keywords extraction in Python - How to handle hyphenated compound words - python

I'm trying to perform keyphrase extraction with Python, using KeyBert and pke PositionRank. You can see an extract of my code below.
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import pke
text = "The life-cycle Global Warming Potential of the building resulting from the construction has been calculated for each stage in the life-cycle and is disclosed to investors and clients on demand" #text_cleaning(df_tassonomia.iloc[1077].text, sentence_adjustment, stop_words)
# Pke
extractor = pke.unsupervised.PositionRank()
extractor.load_document(text, language='en')
extractor.candidate_selection(maximum_word_number = 5)
extractor.candidate_weighting(window = 10)
keyphrases = extractor.get_n_best(n=10)
print(keyphrases)
# KeyBert
kw_model = KeyBERT(model = "all-mpnet-base-v2")
keyphrases_2 = kw_model.extract_keywords(docs=text,
                                         vectorizer=KeyphraseCountVectorizer(),
                                         keyphrase_ngram_range=(1, 5),
                                         top_n=10)
print("")
print(keyphrases_2)
and here the results:
[('cycle global warming potential', 0.44829175082921835), ('life', 0.17858359644549557), ('cycle', 0.15775994057934534), ('building', 0.09131084381406684), ('construction', 0.08860454878871142), ('investors', 0.05426710724030216), ('clients', 0.054111700289631526), ('stage', 0.045672396861507744), ('demand', 0.039158055731066406)]
[('cycle global warming potential', 0.5444), ('building', 0.4479), ('construction', 0.3476), ('investors', 0.1967), ('clients', 0.1519), ('demand', 0.1484), ('cycle', 0.1312), ('stage', 0.0931), ('life', 0.0847)]
I would like hyphenated compound words (such as life-cycle in the example) to be treated as a single word, but I cannot understand how to exclude the - from the list of word separators.
Thank you in advance for any help.
Francesca

This could be a silly workaround, but it may help:
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import pke

text = "The life-cycle Global Warming Potential of the building resulting from the construction has been calculated for each stage in the life-cycle and is disclosed to investors and clients on demand"

# Pke
tokens = text.split()
original = set(x for x in tokens if "_" in x)  # tokens that already contained an underscore
text = text.replace("-", "_")
extractor = pke.unsupervised.PositionRank()
extractor.load_document(text, language='en')
extractor.candidate_selection(maximum_word_number=5)
extractor.candidate_weighting(window=10)
keyphrases = extractor.get_n_best(n=10)

keyphrases_replaced = []
for pair in keyphrases:
    if "_" in pair[0] and pair[0] not in original:
        keyphrases_replaced.append((pair[0].replace("_", "-"), pair[1]))
    else:
        keyphrases_replaced.append(pair)
print(keyphrases_replaced)

# KeyBert
kw_model = KeyBERT(model="all-mpnet-base-v2")  # model defined as in the question
keyphrases_2 = kw_model.extract_keywords(docs=text,
                                         vectorizer=KeyphraseCountVectorizer(),
                                         keyphrase_ngram_range=(1, 5),
                                         top_n=10)
print("")
print(keyphrases_2)
The output should look like this:
[('life-cycle global warming potential', 0.5511001220016548), ('life-cycle', 0.20123353586644233), ('construction', 0.11945270995269436), ('building', 0.10637157845606555), ('investors', 0.06675114967366767), ('stage', 0.05503532672910801), ('clients', 0.0507262942318816), ('demand', 0.05056281895492815)]
I hope this helps :)

The issue has been fixed in the latest pke updates: https://github.com/boudinfl/pke/issues/195
import pke
extractor = pke.unsupervised.TopicRank()
extractor.load_document(input='BERT is a state-of-the-art model.', language='en')
extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")
print(extractor.candidates.keys())
now returns this output:
dict_keys(['bert', 'state-of-the-art model'])
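For reference, the same check can be run on the question's PositionRank pipeline with an updated pke; this is an untested sketch reusing the calls from the question, not output I have verified:
import pke

text = "The life-cycle Global Warming Potential of the building resulting from the construction has been calculated for each stage in the life-cycle and is disclosed to investors and clients on demand"

extractor = pke.unsupervised.PositionRank()
extractor.load_document(text, language='en')
extractor.candidate_selection(maximum_word_number=5)
extractor.candidate_weighting(window=10)
# with the fix linked above, 'life-cycle' should survive as a single hyphenated token
print(extractor.get_n_best(n=10))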

Related

want to add left out string in matched string

Below is my example code:
from fuzzywuzzy import fuzz
import json
from itertools import zip_longest

synonyms = open("synonyms.json", "r")
synonyms = json.loads(synonyms.read())

vendor_data = ["i7 processor", "solid state", "Corei5 :1135G7 (11th Generation)", "hard drive",
               "ddr 8gb", "something1", "something2", "something3", "HT (100W) DDR4-2400"]
buyer_data = ["i7 processor 12 generation", "corei7:latest technology"]

vendor = []
buyer = []
for item, value in synonyms.items():
    for k, k2 in zip_longest(vendor_data, buyer_data):
        for v in value:
            if fuzz.token_set_ratio(k, v) > 70:
                if item in k:
                    vendor.append(k)
                else:
                    vendor.append(item + " " + k)
            else:
                # didn't get only "something" strings here!
                if fuzz.token_set_ratio(k2, v) > 70:
                    if item in k2:
                        buyer.append(k2)
                    else:
                        buyer.append(item + " " + k2)

vendor = list(set(vendor))
buyer = list(set(buyer))
vendor, buyer
Note: "something" string can be anything like "battery" or "display"etc
synonyms json
{
    "processor": ["corei5", "core", "corei7", "i5", "i7", "ryzen5", "i5 processor", "i7 processor", "processor i5", "processor i7", "core generation", "core gen"],
    "ram": ["DDR4", "memory", "DDR3", "DDR", "DDR 8gb", "DDR 8 gb", "DDR 16gb", "DDR 16 gb", "DDR 32gb", "DDR 32 gb", "DDR4-"],
    "ssd": ["solid state drive", "solid drive"],
    "hdd": ["Hard Drive"]
}
What do I need?
I want to add all the "something" strings to the vendor list dynamically.
! NOTE -- the "something" strings can be anything in the future.
I want to add the "something" strings (the values that did not match with fuzz > 70) to the vendor list as well; basically, I also want to keep the left-out data.
for example like below:
current output
['processor Corei5 :1135G7 (11th Generation)',
'i7 processor',
'ram HT (100W) DDR4-2400',
'ram ddr 8gb',
'hdd hard drive',
'ssd solid state']
expected output below
['processor Corei5 :1135G7 (11th Generation)',
'i7 processor',
'ram HT (100W) DDR4-2400',
'ram ddr 8gb',
'hdd hard drive',
'ssd solid state',
'something1',
'something2',
'something3'] # the "something" strings need to be added to the vendor list dynamically.
What silly mistake am I making? Thank you.
Here's my attempt:
from fuzzywuzzy import process, fuzz
synonyms = {'processor': ['corei5', 'core', 'corei7', 'i5', 'i7', 'ryzen5', 'i5 processor', 'i7 processor', 'processor i5', 'processor i7', 'core generation', 'core gen'], 'ram': ['DDR4', 'memory', 'DDR3', 'DDR', 'DDR 8gb', 'DDR 8 gb', 'DDR 16gb', 'DDR 16 gb', 'DDR 32gb', 'DDR 32 gb', 'DDR4-'], 'ssd': ['solid state drive', 'solid drive'], 'hdd': ['Hard Drive']}
vendor_data = ['i7 processor', 'solid state', 'Corei5 :1135G7 (11th Generation)', 'hard drive', 'ddr 8gb', 'something1', 'something2', 'something3', 'HT (100W) DDR4-2400']
buyer_data = ['i7 processor 12 generation', 'corei7:latest technology']
def find_synonym(s: str, min_score: int = 60):
    results = process.extractBests(s, choices=synonyms, score_cutoff=min_score)
    if not results:
        return None
    return results[0][-1]

def process_data(l: list, min_score: int = 60):
    matches = []
    no_matches = []
    for item in l:
        syn = find_synonym(item, min_score=min_score)
        if syn is not None:
            new_item = f'{syn} {item}' if syn not in item else item
            matches.append(new_item)
        elif any(fuzz.partial_ratio(s, item) >= min_score for s in synonyms.keys()):
            # one of the synonyms is already in the item string
            matches.append(item)
        else:
            no_matches.append(item)
    return matches, no_matches
For process_data(vendor_data) we get:
(['i7 processor',
'ssd solid state',
'processor Corei5 :1135G7 (11th Generation)',
'hdd hard drive',
'ram ddr 8gb',
'ram HT (100W) DDR4-2400'],
['something1', 'something2', 'something3'])
And for process_data(buyer_data):
(['i7 processor 12 generation', 'processor corei7:latest technology'], [])
I had to lower the cut-off score to 60 to also get results for ddr 8gb. The process_data function returns 2 lists: One with matches with words from the synonyms dict and one with items without matches. If you want exactly the output you listed in your question, just concatenate the two lists like this:
matches, no_matches = process_data(vendor_data)
matches + no_matches # ['i7 processor', 'ssd solid state', 'processor Corei5 :1135G7 (11th Generation)', 'hdd hard drive', 'ram ddr 8gb', 'ram HT (100W) DDR4-2400', 'something1', 'something2', 'something3']
I have tried to come up with a decent answer (certainly not the cleanest one)
import json
from itertools import zip_longest
from fuzzywuzzy import fuzz

synonyms = open("synonyms.json", "r")
synonyms = json.loads(synonyms.read())

vendor_data = ["i7 processor", "solid state", "Corei5 :1135G7 (11thGeneration)", "hard drive",
               "ddr 8gb", "something1", "something2", "something3", "HT (100W) DDR4-2400"]
buyer_data = ["i7 processor 12 generation", "corei7:latest technology"]

vendor = []
buyer = []
for k, k2 in zip_longest(vendor_data, buyer_data):
    has_matched = False
    for item, value in synonyms.items():
        for v in value:
            if fuzz.token_set_ratio(k, v) > 70:
                if item in k:
                    vendor.append(k)
                else:
                    vendor.append(item + " " + k)
                if has_matched or k2 is None:
                    break
                else:
                    has_matched = True
            if fuzz.token_set_ratio(k2, v) > 70:
                if item in k2:
                    buyer.append(k2)
                else:
                    buyer.append(item + " " + k2)
                if has_matched or k is None:
                    break
                else:
                    has_matched = True
        else:
            continue  # match not found
        break  # match is found
    else:  # only evaluates on normal loop end
        # Only something strings
        # do something with the new input values
        continue

vendor = list(set(vendor))
buyer = list(set(buyer))
I hope you can achieve what you want with this code. Check the docs if you don't know what a for-else loop does. TL;DR: the else clause executes when the loop terminates normally (not with a break). Note that I put the synonyms loop inside the data loop. This is because we can't know for certain which synonym group the data belongs to; also, sometimes the vendor data entry is a processor while the buyer data is memory. Also note that I have assumed an item can't match more than once. If that could be the case, you would need a more advanced check (just keep a counter and break when the counter equals 2, for example).
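For reference, a minimal illustration of that for-else behaviour (not part of the original answer):
for n in [1, 3, 5]:
    if n % 2 == 0:
        print("found an even number")
        break
else:
    # only runs because the loop finished without hitting break
    print("no even number found")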
EDIT:
I took another look at the question and came up with maybe a better answer:
v_dict = dict()
for spec in vendor_data[:]:
    for item, choices in synonyms.items():
        if process.extractOne(spec, choices)[1] > 70:  # don't forget to import process from fuzzywuzzy
            v_dict[spec] = item
            break
    else:
        v_dict[spec] = "Something new"
This code matches the strings to the correct type. For example: {'i7 processor': 'processor', 'solid state': 'ssd', 'Corei5 :1135G7 (11thGeneration)': 'processor', 'hard drive': 'ssd', 'ddr 8gb': 'ram', 'something1': 'Something new', 'something2': 'Something new', 'something3': 'Something new', 'HT (100W) DDR4-2400': 'ram'}. You can change the "Something new" to whatever you like. You could also do: v_dict[spec] = 0 (on a match) and v_dict[spec] = 1 (on no match). You could then sort the dict ->
it = iter(v_dict.values())
print(sorted(v_dict.keys(), key=lambda x: next(it)))
Which would give the wanted results (more or less), all the recognised items will be first, and then all the unrecognised items. You could do some more advanced sorting on this dict if you want. I think this code gives you enough flexibility to reach your goal.
If I understand correctly, what you are trying to do is match keywords specified by a customer and/or vendor against a predefined database of keywords you have.
First, I would highly recommend using a reversed mapping of the synonyms, so lookups are faster, especially as the dataset grows.
Second, considering the fuzzywuzzy API, it looks like you simply want the best match, so extractOne is a solid choice for that.
Now, extractOne returns the best match and a score:
>>> process.extractOne("cowboys", choices)
("Dallas Cowboys", 90)
I would split the algorithm into two parts:
1. A generic part that simply gets the best match, which should always exist (even if it's not a great one).
2. A filter, where you could adjust the sensitivity of the algorithm based on different criteria of your application. This sensitivity threshold should set the minimal match quality. If you're below this threshold, just use "untagged" for the category, for example.
Here is the final code, which I think is very simple and easy to understand and expand:
import json
from fuzzywuzzy import process

def load_synonyms():
    with open('synonyms.json') as fin:
        synonyms = json.load(fin)
    # Reversing the map makes it much easier to lookup
    reversed_synonyms = {}
    for key, values in synonyms.items():
        for value in values:
            reversed_synonyms[value] = key
    return reversed_synonyms

def load_vendor_data():
    return [
        "i7 processor",
        "solid state",
        "Corei5 :1135G7 (11thGeneration)",
        "hard drive",
        "ddr 8gb",
        "something1",
        "something2",
        "something3",
        "HT (100W) DDR4-2400"
    ]

def load_customer_data():
    return [
        "i7 processor 12 generation",
        "corei7:latest technology"
    ]

def get_tag(keyword, synonyms):
    THRESHOLD = 80
    DEFAULT = 'general'
    tag, score = process.extractOne(keyword, synonyms.keys())
    return synonyms[tag] if score > THRESHOLD else DEFAULT

def main():
    synonyms = load_synonyms()
    customer_data = load_customer_data()
    vendor_data = load_vendor_data()
    data = customer_data + vendor_data
    tags_dict = {keyword: get_tag(keyword, synonyms) for keyword in data}
    print(json.dumps(tags_dict, indent=4))

if __name__ == '__main__':
    main()
When running with the specified inputs, the output is:
{
    "i7 processor 12 generation": "processor",
    "corei7:latest technology": "processor",
    "i7 processor": "processor",
    "solid state": "ssd",
    "Corei5 :1135G7 (11thGeneration)": "processor",
    "hard drive": "hdd",
    "ddr 8gb": "ram",
    "something1": "general",
    "something2": "general",
    "something3": "general",
    "HT (100W) DDR4-2400": "ram"
}
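If you also want the flat list format from the question, here is a rough sketch (not part of the original answer) that joins the tag back onto each keyword, assuming tags_dict is made available (e.g. returned from main()):
vendor_list = []
for keyword, tag in tags_dict.items():
    # keep 'general' items and items that already contain their tag as-is,
    # otherwise prefix the keyword with its tag
    if tag == 'general' or tag in keyword.lower():
        vendor_list.append(keyword)
    else:
        vendor_list.append(f"{tag} {keyword}")
print(vendor_list)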

Python - How to count specific section in a list

I'm brand new to Python and I'm struggling with how to add up certain sections of a CSV file in Python. I'm not allowed to use "import csv".
I'm importing the TipJoke CSV file from https://vincentarelbundock.github.io/Rdatasets/datasets.html
This is the only code I have so far that worked and I'm at a total loss on where to go from here.
if __name__ == '__main__':
    from pprint import pprint
    from string import punctuation

    f = open("TipJoke.csv", "r")
    tipList = []
    for line in f:
        # deletes the quotes
        line = line.replace('"', '')
        tipList.append(line)
    pprint(tipList)
Output:
[',Card,Tip,Ad,Joke,None\n',
'1,None,1,0,0,1\n',
'2,Joke,1,0,1,0\n',
'3,Ad,0,1,0,0\n',
'4,None,0,0,0,1\n',
'5,None,1,0,0,1\n',
'6,None,0,0,0,1\n',
'7,Ad,0,1,0,0\n',
'8,Ad,0,1,0,0\n',
'9,None,0,0,0,1\n',
'10,None,0,0,0,1\n',
'11,None,1,0,0,1\n',
'12,Ad,0,1,0,0\n',
'13,None,0,0,0,1\n',
'14,Ad,1,1,0,0\n',
'15,Joke,1,0,1,0\n',
'16,Joke,0,0,1,0\n',
'17,Joke,1,0,1,0\n',
'18,None,0,0,0,1\n',
'19,Joke,0,0,1,0\n',
'20,None,0,0,0,1\n',
'21,Ad,1,1,0,0\n',
'22,Ad,1,1,0,0\n',
'23,Ad,0,1,0,0\n',
'24,Joke,0,0,1,0\n',
'25,Joke,1,0,1,0\n',
'26,Joke,0,0,1,0\n',
'27,None,1,0,0,1\n',
'28,Joke,1,0,1,0\n',
'29,Joke,1,0,1,0\n',
'30,None,1,0,0,1\n',
'31,Joke,0,0,1,0\n',
'32,None,1,0,0,1\n',
'33,Joke,1,0,1,0\n',
'34,Ad,0,1,0,0\n',
'35,Joke,0,0,1,0\n',
'36,Ad,1,1,0,0\n',
'37,Joke,0,0,1,0\n',
'38,Ad,0,1,0,0\n',
'39,Joke,0,0,1,0\n',
'40,Joke,0,0,1,0\n',
'41,Joke,1,0,1,0\n',
'42,None,0,0,0,1\n',
'43,None,0,0,0,1\n',
'44,Ad,0,1,0,0\n',
'45,None,0,0,0,1\n',
'46,None,0,0,0,1\n',
'47,Ad,0,1,0,0\n',
'48,Joke,0,0,1,0\n',
'49,Joke,1,0,1,0\n',
'50,None,1,0,0,1\n',
'51,None,0,0,0,1\n',
'52,Joke,1,0,1,0\n',
'53,Joke,1,0,1,0\n',
'54,Joke,0,0,1,0\n',
'55,None,1,0,0,1\n',
'56,Ad,0,1,0,0\n',
'57,Joke,0,0,1,0\n',
'58,None,0,0,0,1\n',
'59,Ad,0,1,0,0\n',
'60,Joke,1,0,1,0\n',
'61,Ad,0,1,0,0\n',
'62,None,1,0,0,1\n',
'63,Joke,0,0,1,0\n',
'64,Ad,0,1,0,0\n',
'65,Joke,0,0,1,0\n',
'66,Ad,0,1,0,0\n',
'67,Ad,0,1,0,0\n',
'68,Ad,0,1,0,0\n',
'69,None,0,0,0,1\n',
'70,Joke,1,0,1,0\n',
'71,None,1,0,0,1\n',
'72,None,0,0,0,1\n',
'73,None,0,0,0,1\n',
'74,Joke,0,0,1,0\n',
'75,Ad,1,1,0,0\n',
'76,Ad,0,1,0,0\n',
'77,Ad,1,1,0,0\n',
'78,Joke,0,0,1,0\n',
'79,Joke,0,0,1,0\n',
'80,Ad,1,1,0,0\n',
'81,Ad,0,1,0,0\n',
'82,None,0,0,0,1\n',
'83,Ad,0,1,0,0\n',
'84,Joke,0,0,1,0\n',
'85,Joke,0,0,1,0\n',
'86,Ad,1,1,0,0\n',
'87,None,1,0,0,1\n',
'88,Joke,1,0,1,0\n',
'89,Ad,0,1,0,0\n',
'90,None,0,0,0,1\n',
'91,None,0,0,0,1\n',
'92,Joke,0,0,1,0\n',
'93,Joke,0,0,1,0\n',
'94,Ad,0,1,0,0\n',
'95,Ad,0,1,0,0\n',
'96,Ad,0,1,0,0\n',
'97,Joke,1,0,1,0\n',
'98,None,0,0,0,1\n',
'99,None,0,0,0,1\n',
'100,None,1,0,0,1\n',
'101,Joke,0,0,1,0\n',
'102,Joke,0,0,1,0\n',
'103,Ad,1,1,0,0\n',
'104,Ad,0,1,0,0\n',
'105,Ad,0,1,0,0\n',
'106,Ad,1,1,0,0\n',
'107,Ad,0,1,0,0\n',
'108,None,0,0,0,1\n',
'109,Ad,0,1,0,0\n',
'110,Joke,1,0,1,0\n',
'111,None,0,0,0,1\n',
'112,Ad,0,1,0,0\n',
'113,Ad,0,1,0,0\n',
'114,None,0,0,0,1\n',
'115,Ad,0,1,0,0\n',
'116,None,0,0,0,1\n',
'117,None,0,0,0,1\n',
'118,Ad,0,1,0,0\n',
'119,None,1,0,0,1\n',
'120,Ad,1,1,0,0\n',
'121,Ad,0,1,0,0\n',
'122,Ad,1,1,0,0\n',
'123,None,0,0,0,1\n',
'124,None,0,0,0,1\n',
'125,Joke,1,0,1,0\n',
'126,Joke,1,0,1,0\n',
'127,Ad,0,1,0,0\n',
'128,Joke,0,0,1,0\n',
'129,Joke,0,0,1,0\n',
'130,Ad,0,1,0,0\n',
'131,None,0,0,0,1\n',
'132,None,0,0,0,1\n',
'133,None,0,0,0,1\n',
'134,Joke,1,0,1,0\n',
'135,Ad,0,1,0,0\n',
'136,None,0,0,0,1\n',
'137,Joke,0,0,1,0\n',
'138,Ad,0,1,0,0\n',
'139,Ad,0,1,0,0\n',
'140,None,0,0,0,1\n',
'141,Joke,0,0,1,0\n',
'142,None,0,0,0,1\n',
'143,Ad,0,1,0,0\n',
'144,None,1,0,0,1\n',
'145,Joke,0,0,1,0\n',
'146,Ad,0,1,0,0\n',
'147,Ad,0,1,0,0\n',
'148,Ad,0,1,0,0\n',
'149,Joke,1,0,1,0\n',
'150,Ad,1,1,0,0\n',
'151,Joke,1,0,1,0\n',
'152,None,0,0,0,1\n',
'153,Ad,0,1,0,0\n',
'154,None,0,0,0,1\n',
'155,None,0,0,0,1\n',
'156,Ad,0,1,0,0\n',
'157,Ad,0,1,0,0\n',
'158,Joke,0,0,1,0\n',
'159,None,0,0,0,1\n',
'160,Joke,1,0,1,0\n',
'161,None,1,0,0,1\n',
'162,Ad,1,1,0,0\n',
'163,Joke,0,0,1,0\n',
'164,Joke,0,0,1,0\n',
'165,Ad,0,1,0,0\n',
'166,Joke,1,0,1,0\n',
'167,Joke,1,0,1,0\n',
'168,Ad,0,1,0,0\n',
'169,Joke,1,0,1,0\n',
'170,Joke,0,0,1,0\n',
'171,Ad,0,1,0,0\n',
'172,Joke,0,0,1,0\n',
'173,Joke,0,0,1,0\n',
'174,Ad,0,1,0,0\n',
'175,None,0,0,0,1\n',
'176,Joke,1,0,1,0\n',
'177,Ad,0,1,0,0\n',
'178,Joke,0,0,1,0\n',
'179,Joke,0,0,1,0\n',
'180,None,0,0,0,1\n',
'181,None,0,0,0,1\n',
'182,Ad,0,1,0,0\n',
'183,None,0,0,0,1\n',
'184,None,0,0,0,1\n',
'185,None,0,0,0,1\n',
'186,None,0,0,0,1\n',
'187,Ad,0,1,0,0\n',
'188,None,1,0,0,1\n',
'189,Ad,0,1,0,0\n',
'190,Ad,0,1,0,0\n',
'191,Ad,0,1,0,0\n',
'192,Joke,1,0,1,0\n',
'193,Joke,0,0,1,0\n',
'194,Ad,0,1,0,0\n',
'195,None,0,0,0,1\n',
'196,Joke,1,0,1,0\n',
'197,Joke,0,0,1,0\n',
'198,Joke,1,0,1,0\n',
'199,Ad,0,1,0,0\n',
'200,None,0,0,0,1\n',
'201,Joke,1,0,1,0\n',
'202,Joke,0,0,1,0\n',
'203,Joke,0,0,1,0\n',
'204,Ad,0,1,0,0\n',
'205,None,0,0,0,1\n',
'206,Ad,0,1,0,0\n',
'207,Ad,0,1,0,0\n',
'208,Joke,0,0,1,0\n',
'209,Ad,0,1,0,0\n',
'210,Joke,0,0,1,0\n',
'211,None,0,0,0,1\n']
I'm currently trying to find the total number of entries for a specified card type and the percentage of tips given for that card type, with two decimal places of precision. The tip column is the 0 or 1 right after the card type (None, Ad, Joke).
If you are allowed to use the pandas library, then:
import pandas as pd
df = pd.read_csv("TipJoke.csv")
df is a pandas DataFrame object on which you can perform multiple filtering tasks according to your needs.
for example if you want to get data for Joke you can filter like this:
print(df[df["Card"] == "Joke"])
Though, I'm just providing you the direction, not the whole logic for your question.
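For instance, a rough sketch (my own addition, not part of the original answer) of the counts and tip percentages the question asks for, assuming the CSV has the Card and Tip columns shown in the output above:
for card in ["None", "Ad", "Joke"]:
    subset = df[df["Card"] == card]
    total = len(subset)
    tip_pct = 100 * subset["Tip"].mean() if total else 0.0
    print(f"{card}: {total} entries, {tip_pct:.2f}% tipped")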
This works:
from pprint import pprint
from string import punctuation

# count how many tips (Tip == 1) were left for each card type
counts = {"Joke": 0, "Ad": 0, "None": 0}

with open("TipJoke.csv", "r") as f:
    for line in f:
        line_clean = line.replace('"', "").replace("\n", "").split(",")
        try:
            counts[line_clean[1]] += int(line_clean[2])
        except:
            pass

print(counts)
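If you also need the total number of rows per card type and the tip percentage with two decimal places, as the question asks, a minimal extension of the same approach (an untested sketch, still without the csv module) could look like this:
totals = {"Joke": 0, "Ad": 0, "None": 0}
tips = {"Joke": 0, "Ad": 0, "None": 0}

with open("TipJoke.csv", "r") as f:
    next(f)  # skip the header row
    for line in f:
        fields = line.replace('"', "").strip().split(",")
        card, tip = fields[1], int(fields[2])
        if card in totals:
            totals[card] += 1
            tips[card] += tip

for card in totals:
    pct = 100 * tips[card] / totals[card] if totals[card] else 0.0
    print(f"{card}: {totals[card]} entries, {pct:.2f}% tipped")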

Fastest way to count non spacing chars in Unicode text in Python

Given the Unicode non spacing marks list - https://www.fileformat.info/info/unicode/category/Mn/list.htm
UNICODE_NSM = ['\u0300', '\u0301', '\u0302', '\u0303', '\u0304', '\u0305', '\u0306', '\u0307', '\u0308', '\u0309', '\u030A', '\u030B', '\u030C', '\u030D', '\u030E', '\u030F', '\u0310', '\u0311', '\u0312', '\u0313', '\u0314', '\u0315', '\u0316', '\u0317', '\u0318', '\u0319', '\u031A', '\u031B', '\u031C', '\u031D', '\u031E', '\u031F', '\u0320', '\u0321', '\u0322', '\u0323', '\u0324', '\u0325', '\u0326', '\u0327', '\u0328', '\u0329', '\u032A', '\u032B', '\u032C', '\u032D', '\u032E', '\u032F', '\u0330', '\u0331', '\u0332', '\u0333', '\u0334', '\u0335', '\u0336', '\u0337', '\u0338', '\u0339', '\u033A', '\u033B', '\u033C', '\u033D', '\u033E', '\u033F', '\u0340', '\u0341', '\u0342', '\u0343', '\u0344', '\u0345', '\u0346', '\u0347', '\u0348', '\u0349', '\u034A', '\u034B', '\u034C', '\u034D', '\u034E', '\u034F', '\u0350', '\u0351', '\u0352', '\u0353', '\u0354', '\u0355', '\u0356', '\u0357', '\u0358', '\u0359', '\u035A', '\u035B', '\u035C', '\u035D', '\u035E', '\u035F', '\u0360', '\u0361', '\u0362', '\u0363', '\u0364', '\u0365', '\u0366', '\u0367', '\u0368', '\u0369', '\u036A', '\u036B', '\u036C', '\u036D', '\u036E', '\u036F', '\u0483', '\u0484', '\u0485', '\u0486', '\u0487', '\u0591', '\u0592', '\u0593', '\u0594', '\u0595', '\u0596', '\u0597', '\u0598', '\u0599', '\u059A', '\u059B', '\u059C', '\u059D', '\u059E', '\u059F', '\u05A0', '\u05A1', '\u05A2', '\u05A3', '\u05A4', '\u05A5', '\u05A6', '\u05A7', '\u05A8', '\u05A9', '\u05AA', '\u05AB', '\u05AC', '\u05AD', '\u05AE', '\u05AF', '\u05B0', '\u05B1', '\u05B2', '\u05B3', '\u05B4', '\u05B5', '\u05B6', '\u05B7', '\u05B8', '\u05B9', '\u05BA', '\u05BB', '\u05BC', '\u05BD', '\u05BF', '\u05C1', '\u05C2', '\u05C4', '\u05C5', '\u05C7', '\u0610', '\u0611', '\u0612', '\u0613', '\u0614', '\u0615', '\u0616', '\u0617', '\u0618', '\u0619', '\u061A', '\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650', '\u0651', '\u0652', '\u0653', '\u0654', '\u0655', '\u0656', '\u0657', '\u0658', '\u0659', '\u065A', '\u065B', '\u065C', '\u065D', '\u065E', '\u065F', '\u0670', '\u06D6', '\u06D7', '\u06D8', '\u06D9', '\u06DA', '\u06DB', '\u06DC', '\u06DF', '\u06E0', '\u06E1', '\u06E2', '\u06E3', '\u06E4', '\u06E7', '\u06E8', '\u06EA', '\u06EB', '\u06EC', '\u06ED', '\u0711', '\u0730', '\u0731', '\u0732', '\u0733', '\u0734', '\u0735', '\u0736', '\u0737', '\u0738', '\u0739', '\u073A', '\u073B', '\u073C', '\u073D', '\u073E', '\u073F', '\u0740', '\u0741', '\u0742', '\u0743', '\u0744', '\u0745', '\u0746', '\u0747', '\u0748', '\u0749', '\u074A', '\u07A6', '\u07A7', '\u07A8', '\u07A9', '\u07AA', '\u07AB', '\u07AC', '\u07AD', '\u07AE', '\u07AF', '\u07B0', '\u07EB', '\u07EC', '\u07ED', '\u07EE', '\u07EF', '\u07F0', '\u07F1', '\u07F2', '\u07F3', '\u0816', '\u0817', '\u0818', '\u0819', '\u081B', '\u081C', '\u081D', '\u081E', '\u081F', '\u0820', '\u0821', '\u0822', '\u0823', '\u0825', '\u0826', '\u0827', '\u0829', '\u082A', '\u082B', '\u082C', '\u082D', '\u0859', '\u085A', '\u085B', '\u08E4', '\u08E5', '\u08E6', '\u08E7', '\u08E8', '\u08E9', '\u08EA', '\u08EB', '\u08EC', '\u08ED', '\u08EE', '\u08EF', '\u08F0', '\u08F1', '\u08F2', '\u08F3', '\u08F4', '\u08F5', '\u08F6', '\u08F7', '\u08F8', '\u08F9', '\u08FA', '\u08FB', '\u08FC', '\u08FD', '\u08FE', '\u0900', '\u0901', '\u0902', '\u093A', '\u093C', '\u093E', '\u0941', '\u0942', '\u0943', '\u0944', '\u0945', '\u0946', '\u0947', '\u0948', '\u094D', '\u0951', '\u0952', '\u0953', '\u0954', '\u0955', '\u0956', '\u0957', '\u0962', '\u0963', '\u0981', '\u09BC', '\u09C1', '\u09C2', '\u09C3', '\u09C4', '\u09CD', '\u09E2', '\u09E3', '\u0A01', '\u0A02', 
'\u0A3C', '\u0A41', '\u0A42', '\u0A47', '\u0A48', '\u0A4B', '\u0A4C', '\u0A4D', '\u0A51', '\u0A70', '\u0A71', '\u0A75', '\u0A81', '\u0A82', '\u0ABC', '\u0AC1', '\u0AC2', '\u0AC3', '\u0AC4', '\u0AC5', '\u0AC7', '\u0AC8', '\u0ACD', '\u0AE2', '\u0AE3', '\u0B01', '\u0B3C', '\u0B3F', '\u0B41', '\u0B42', '\u0B43', '\u0B44', '\u0B4D', '\u0B56', '\u0B62', '\u0B63', '\u0B82', '\u0BC0', '\u0BCD', '\u0C3E', '\u0C3F', '\u0C40', '\u0C46', '\u0C47', '\u0C48', '\u0C4A', '\u0C4B', '\u0C4C', '\u0C4D', '\u0C55', '\u0C56', '\u0C62', '\u0C63', '\u0CBC', '\u0CBF', '\u0CC6', '\u0CCC', '\u0CCD', '\u0CE2', '\u0CE3', '\u0D41', '\u0D42', '\u0D43', '\u0D44', '\u0D4D', '\u0D62', '\u0D63', '\u0DCA', '\u0DD2', '\u0DD3', '\u0DD4', '\u0DD6', '\u0E31', '\u0E34', '\u0E35', '\u0E36', '\u0E37', '\u0E38', '\u0E39', '\u0E3A', '\u0E47', '\u0E48', '\u0E49', '\u0E4A', '\u0E4B', '\u0E4C', '\u0E4D', '\u0E4E', '\u0EB1', '\u0EB4', '\u0EB5', '\u0EB6', '\u0EB7', '\u0EB8', '\u0EB9', '\u0EBB', '\u0EBC', '\u0EC8', '\u0EC9', '\u0ECA', '\u0ECB', '\u0ECC', '\u0ECD', '\u0F18', '\u0F19', '\u0F35', '\u0F37', '\u0F39', '\u0F71', '\u0F72', '\u0F73', '\u0F74', '\u0F75', '\u0F76', '\u0F77', '\u0F78', '\u0F79', '\u0F7A', '\u0F7B', '\u0F7C', '\u0F7D', '\u0F7E', '\u0F80', '\u0F81', '\u0F82', '\u0F83', '\u0F84', '\u0F86', '\u0F87', '\u0F8D', '\u0F8E', '\u0F8F', '\u0F90', '\u0F91', '\u0F92', '\u0F93', '\u0F94', '\u0F95', '\u0F96', '\u0F97', '\u0F99', '\u0F9A', '\u0F9B', '\u0F9C', '\u0F9D', '\u0F9E', '\u0F9F', '\u0FA0', '\u0FA1', '\u0FA2', '\u0FA3', '\u0FA4', '\u0FA5', '\u0FA6', '\u0FA7', '\u0FA8', '\u0FA9', '\u0FAA', '\u0FAB', '\u0FAC', '\u0FAD', '\u0FAE', '\u0FAF', '\u0FB0', '\u0FB1', '\u0FB2', '\u0FB3', '\u0FB4', '\u0FB5', '\u0FB6', '\u0FB7', '\u0FB8', '\u0FB9', '\u0FBA', '\u0FBB', '\u0FBC', '\u0FC6', '\u102D', '\u102E', '\u102F', '\u1030', '\u1032', '\u1033', '\u1034', '\u1035', '\u1036', '\u1037', '\u1039', '\u103A', '\u103D', '\u103E', '\u1058', '\u1059', '\u105E', '\u105F', '\u1060', '\u1071', '\u1072', '\u1073', '\u1074', '\u1082', '\u1085', '\u1086', '\u108D', '\u109D', '\u135D', '\u135E', '\u135F', '\u1712', '\u1713', '\u1714', '\u1732', '\u1733', '\u1734', '\u1752', '\u1753', '\u1772', '\u1773', '\u17B4', '\u17B5', '\u17B7', '\u17B8', '\u17B9', '\u17BA', '\u17BB', '\u17BC', '\u17BD', '\u17C6', '\u17C9', '\u17CA', '\u17CB', '\u17CC', '\u17CD', '\u17CE', '\u17CF', '\u17D0', '\u17D1', '\u17D2', '\u17D3', '\u17DD', '\u180B', '\u180C', '\u180D', '\u18A9', '\u1920', '\u1921', '\u1922', '\u1927', '\u1928', '\u1932', '\u1939', '\u193A', '\u193B', '\u1A17', '\u1A18', '\u1A56', '\u1A58', '\u1A59', '\u1A5A', '\u1A5B', '\u1A5C', '\u1A5D', '\u1A5E', '\u1A60', '\u1A62', '\u1A65', '\u1A66', '\u1A67', '\u1A68', '\u1A69', '\u1A6A', '\u1A6B', '\u1A6C', '\u1A73', '\u1A74', '\u1A75', '\u1A76', '\u1A77', '\u1A78', '\u1A79', '\u1A7A', '\u1A7B', '\u1A7C', '\u1A7F', '\u1B00', '\u1B01', '\u1B02', '\u1B03', '\u1B34', '\u1B36', '\u1B37', '\u1B38', '\u1B39', '\u1B3A', '\u1B3C', '\u1B42', '\u1B6B', '\u1B6C', '\u1B6D', '\u1B6E', '\u1B6F', '\u1B70', '\u1B71', '\u1B72', '\u1B73', '\u1B80', '\u1B81', '\u1BA2', '\u1BA3', '\u1BA4', '\u1BA5', '\u1BA8', '\u1BA9', '\u1BAB', '\u1BE6', '\u1BE8', '\u1BE9', '\u1BED', '\u1BEF', '\u1BF0', '\u1BF1', '\u1C2C', '\u1C2D', '\u1C2E', '\u1C2F', '\u1C30', '\u1C31', '\u1C32', '\u1C33', '\u1C36', '\u1C37', '\u1CD0', '\u1CD1', '\u1CD2', '\u1CD4', '\u1CD5', '\u1CD6', '\u1CD7', '\u1CD8', '\u1CD9', '\u1CDA', '\u1CDB', '\u1CDC', '\u1CDD', '\u1CDE', '\u1CDF', '\u1CE0', '\u1CE2', '\u1CE3', '\u1CE4', '\u1CE5', '\u1CE6', '\u1CE7', '\u1CE8', '\u1CED', 
'\u1CF4', '\u1DC0', '\u1DC1', '\u1DC2', '\u1DC3', '\u1DC4', '\u1DC5', '\u1DC6', '\u1DC7', '\u1DC8', '\u1DC9', '\u1DCA', '\u1DCB', '\u1DCC', '\u1DCD', '\u1DCE', '\u1DCF', '\u1DD0', '\u1DD1', '\u1DD2', '\u1DD3', '\u1DD4', '\u1DD5', '\u1DD6', '\u1DD7', '\u1DD8', '\u1DD9', '\u1DDA', '\u1DDB', '\u1DDC', '\u1DDD', '\u1DDE', '\u1DDF', '\u1DE0', '\u1DE1', '\u1DE2', '\u1DE3', '\u1DE4', '\u1DE5', '\u1DE6', '\u1DFC', '\u1DFD', '\u1DFE', '\u1DFF', '\u20D0', '\u20D1', '\u20D2', '\u20D3', '\u20D4', '\u20D5', '\u20D6', '\u20D7', '\u20D8', '\u20D9', '\u20DA', '\u20DB', '\u20DC', '\u20E1', '\u20E5', '\u20E6', '\u20E7', '\u20E8', '\u20E9', '\u20EA', '\u20EB', '\u20EC', '\u20ED', '\u20EE', '\u20EF', '\u20F0', '\u2CEF', '\u2CF0', '\u2CF1', '\u2D7F', '\u2DE0', '\u2DE1', '\u2DE2', '\u2DE3', '\u2DE4', '\u2DE5', '\u2DE6', '\u2DE7', '\u2DE8', '\u2DE9', '\u2DEA', '\u2DEB', '\u2DEC', '\u2DED', '\u2DEE', '\u2DEF', '\u2DF0', '\u2DF1', '\u2DF2', '\u2DF3', '\u2DF4', '\u2DF5', '\u2DF6', '\u2DF7', '\u2DF8', '\u2DF9', '\u2DFA', '\u2DFB', '\u2DFC', '\u2DFD', '\u2DFE', '\u2DFF', '\u302A', '\u302B', '\u302C', '\u302D', '\u3099', '\u309A', '\uA66F', '\uA674', '\uA675', '\uA676', '\uA677', '\uA678', '\uA679', '\uA67A', '\uA67B', '\uA67C', '\uA67D', '\uA69F', '\uA6F0', '\uA6F1', '\uA802', '\uA806', '\uA80B', '\uA825', '\uA826', '\uA8C4', '\uA8E0', '\uA8E1', '\uA8E2', '\uA8E3', '\uA8E4', '\uA8E5', '\uA8E6', '\uA8E7', '\uA8E8', '\uA8E9', '\uA8EA', '\uA8EB', '\uA8EC', '\uA8ED', '\uA8EE', '\uA8EF', '\uA8F0', '\uA8F1', '\uA926', '\uA927', '\uA928', '\uA929', '\uA92A', '\uA92B', '\uA92C', '\uA92D', '\uA947', '\uA948', '\uA949', '\uA94A', '\uA94B', '\uA94C', '\uA94D', '\uA94E', '\uA94F', '\uA950', '\uA951', '\uA980', '\uA981', '\uA982', '\uA9B3', '\uA9B6', '\uA9B7', '\uA9B8', '\uA9B9', '\uA9BC', '\uAA29', '\uAA2A', '\uAA2B', '\uAA2C', '\uAA2D', '\uAA2E', '\uAA31', '\uAA32', '\uAA35', '\uAA36', '\uAA43', '\uAA4C', '\uAAB0', '\uAAB2', '\uAAB3', '\uAAB4', '\uAAB7', '\uAAB8', '\uAABE', '\uAABF', '\uAAC1', '\uAAEC', '\uAAED', '\uAAF6', '\uABE5', '\uABE8', '\uABED', '\uFB1E', '\uFE00', '\uFE01', '\uFE02', '\uFE03', '\uFE04', '\uFE05', '\uFE06', '\uFE07', '\uFE08', '\uFE09', '\uFE0A', '\uFE0B', '\uFE0C', '\uFE0D', '\uFE0E', '\uFE0F', '\uFE20', '\uFE21', '\uFE22', '\uFE23', '\uFE24', '\uFE25', '\uFE26', '\U000101FD', '\U00010A01', '\U00010A02', '\U00010A03', '\U00010A05', '\U00010A06', '\U00010A0C', '\U00010A0D', '\U00010A0E', '\U00010A0F', '\U00010A38', '\U00010A39', '\U00010A3A', '\U00010A3F', '\U00011001', '\U00011038', '\U00011039', '\U0001103A', '\U0001103B', '\U0001103C', '\U0001103D', '\U0001103E', '\U0001103F', '\U00011040', '\U00011041', '\U00011042', '\U00011043', '\U00011044', '\U00011045', '\U00011046', '\U00011080', '\U00011081', '\U000110B3', '\U000110B4', '\U000110B5', '\U000110B6', '\U000110B9', '\U000110BA', '\U00011100', '\U00011101', '\U00011102', '\U00011127', '\U00011128', '\U00011129', '\U0001112A', '\U0001112B', '\U0001112D', '\U0001112E', '\U0001112F', '\U00011130', '\U00011131', '\U00011132', '\U00011133', '\U00011134', '\U00011180', '\U00011181', '\U000111B6', '\U000111B7', '\U000111B8', '\U000111B9', '\U000111BA', '\U000111BB', '\U000111BC', '\U000111BD', '\U000111BE', '\U000116AB', '\U000116AD', '\U000116B0', '\U000116B1', '\U000116B2', '\U000116B3', '\U000116B4', '\U000116B5', '\U000116B7', '\U00016F8F', '\U00016F90', '\U00016F91', '\U00016F92', '\U0001D167', '\U0001D168', '\U0001D169', '\U0001D17B', '\U0001D17C', '\U0001D17D', '\U0001D17E', '\U0001D17F', '\U0001D180', '\U0001D181', '\U0001D182', '\U0001D185', 
'\U0001D186', '\U0001D187', '\U0001D188', '\U0001D189', '\U0001D18A', '\U0001D18B', '\U0001D1AA', '\U0001D1AB', '\U0001D1AC', '\U0001D1AD', '\U0001D242', '\U0001D243', '\U0001D244', '\U000E0100', '\U000E0101', '\U000E0102', '\U000E0103', '\U000E0104', '\U000E0105', '\U000E0106', '\U000E0107', '\U000E0108', '\U000E0109', '\U000E010A', '\U000E010B', '\U000E010C', '\U000E010D', '\U000E010E', '\U000E010F', '\U000E0110', '\U000E0111', '\U000E0112', '\U000E0113', '\U000E0114', '\U000E0115', '\U000E0116', '\U000E0117', '\U000E0118', '\U000E0119', '\U000E011A', '\U000E011B', '\U000E011C', '\U000E011D', '\U000E011E', '\U000E011F', '\U000E0120', '\U000E0121', '\U000E0122', '\U000E0123', '\U000E0124', '\U000E0125', '\U000E0126', '\U000E0127', '\U000E0128', '\U000E0129', '\U000E012A', '\U000E012B', '\uE012C', '\U000E012D', '\U000E012E', '\U000E012F', '\U000E0130', '\U000E0131', '\U000E0132', '\U000E0133', '\U000E0134', '\U000E0135', '\U000E0136', '\U000E0137', '\U000E0138', '\U000E0139', '\U000E013A', '\U000E013B', '\U000E013C', '\U000E013D', '\U000E013E', '\U000E013F', '\U000E0140', '\U000E0141', '\U000E0142', '\U000E0143', '\U000E0144', '\U000E0145', '\U000E0146', '\U000E0147', '\U000E0148', '\U000E0149', '\U000E014A', '\U000E014B', '\U000E014C', '\U000E014D', '\U000E014E', '\U000E014F', '\U000E0150', '\U000E0151', '\U000E0152', '\U000E0153', '\U000E0154', '\U000E0155', '\U000E0156', '\U000E0157', '\U000E0158', '\U000E0159', '\U000E015A', '\U000E015B', '\U000E015C', '\U000E015D', '\U000E015E', '\U000E015F', '\U000E0160', '\U000E0161', '\U000E0162', '\U000E0163', '\U000E0164', '\U000E0165', '\U000E0166', '\U000E0167', '\U000E0168', '\U000E0169', '\U000E016A', '\U000E016B', '\U000E016C', '\U000E016D', '\U000E016E', '\U000E016F', '\U000E0170', '\U000E0171', '\U000E0172', '\U000E0173', '\U000E0174', '\U000E0175', '\U000E0176', '\U000E0177', '\U000E0178', '\U000E0179', '\U000E017A', '\U000E017B', '\U000E017C', '\U000E017D', '\U000E017E', '\U000E017F', '\U000E0180', '\U000E0181', '\U000E0182', '\U000E0183', '\U000E0184', '\U000E0185', '\uE0186', '\U000E0187', '\U000E0188', '\U000E0189', '\U000E018A', '\U000E018B', '\U000E018C', '\U000E018D', '\U000E018E', '\U000E018F', '\U000E0190', '\U000E0191', '\U000E0192', '\U000E0193', '\U000E0194', '\U000E0195', '\U000E0196', '\U000E0197', '\U000E0198', '\U000E0199', '\U000E019A', '\U000E019B', '\U000E019C', '\U000E019D', '\U000E019E', '\U000E019F', '\U000E01A0', '\U000E01A1', '\U000E01A2', '\U000E01A3', '\U000E01A4', '\U000E01A5', '\U000E01A6', '\U000E01A7', '\U000E01A8', '\U000E01A9', '\U000E01AA', '\U000E01AB', '\U000E01AC', '\U000E01AD', '\U000E01AE', '\U000E01AF', '\U000E01B0', '\U000E01B1', '\U000E01B2', '\U000E01B3', '\U000E01B4', '\U000E01B5', '\U000E01B6', '\U000E01B7', '\U000E01B8', '\U000E01B9', '\U000E01BA', '\U000E01BB', '\U000E01BC', '\U000E01BD', '\U000E01BE', '\U000E01BF', '\U000E01C0', '\U000E01C1', '\U000E01C2', '\U000E01C3', '\U000E01C4', '\U000E01C5', '\U000E01C6', '\U000E01C7', '\U000E01C8', '\U000E01C9', '\U000E01CA', '\U000E01CB', '\U000E01CC', '\U000E01CD', '\U000E01CE', '\U000E01CF', '\U000E01D0', '\U000E01D1', '\U000E01D2', '\U000E01D3', '\U000E01D4', '\U000E01D5', '\U000E01D6', '\U000E01D7', '\U000E01D8', '\U000E01D9', '\U000E01DA', '\U000E01DB', '\U000E01DC', '\U000E01DD', '\U000E01DE', '\U000E01DF', '\U000E01E0', '\U000E01E1', '\U000E01E2', '\U000E01E3', '\U000E01E4', '\U000E01E5', '\U000E01E6', '\U000E01E7', '\U000E01E8', '\U000E01E9', '\U000E01EA', '\U000E01EB', '\U000E01EC', '\U000E01ED', '\U000E01EE', '\U000E01EF'];
NOTE.
Please note that we have both \U000XXXXX and \uXXXX representations here.
I want to count the Unicode input text like this Hindi string "अब यहां से कहा जाएँ हम" or just a token word like "समझा", excluding the non spacing characters.
My implementation looks like
def countNonSpacingCharString(str):
    count = 0
    for char in str:
        if char not in UNICODE_NSM:
            count = count + 1
    return count
Thanks to the help provided in the answers below I have put all together in this github. There is also a mark codepoints list ready to be used in JavaScript / Node.js - https://github.com/loretoparisi/unicode_marks
Fastest way I came up with. len was slightly faster than sum. I built a set of all combining mark types in the setup.
test.py:
import sys
from unicodedata import category

MARK_SET = set(chr(c) for c in range(sys.maxunicode + 1) if category(chr(c))[0] == 'M')

s = "अब यहां से कहा जाएँ हम"

def count_len(s):
    return len([c for c in s if c not in MARK_SET])

def count_sum(s):
    return sum([c not in MARK_SET for c in s])

if __name__ == '__main__':
    print(len(s))
    print(count_len(s))
    print(count_sum(s))
Output:
22
16
16
Timings:
C:\>py -m timeit -s "from test import count_sum,s" "count_sum(s)"
50000 loops, best of 5: 4.62 usec per loop
C:\>py -m timeit -s "from test import count_len,s" "count_len(s)"
50000 loops, best of 5: 3.97 usec per loop
It's worth noting that there is a grapheme 3rd party library. grapheme.length(s) == 16, but it was much slower (118us). The full grapheme-detecting algorithm is more complicated than skipping the modifier category. Consider the combining emojis for families and skin colors.
See also Unicode Text Segmentation.
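For completeness, a minimal sketch of the grapheme-based count mentioned above, assuming the third-party grapheme package is installed:
import grapheme

s = "अब यहां से कहा जाएँ हम"
print(grapheme.length(s))  # 16 user-perceived characters, matching the counts above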
This might be a better alternative:
def countNonSpacingCharString(str):
    return len([char for char in str if not (char in UNICODE_NSM)])
How about using a dictionary to look up the values and if not present, increment the count? It should be faster than the former approach because the time complexity to check the presence of the character reduces to O(1).
The implementation should look somewhat like this:
Create a dict and populate it:
lookup_dict = {}
for alpha in UNICODE_NSM:
    lookup_dict[alpha] = 1
Look it up while looping through the string:
import time

def countNonSpacingCharString(str):
    count = 0
    for char in str:
        start_time = time.time()
        if not lookup_dict.get(char):
            count = count + 1
        print("--- %s seconds ---" % (time.time() - start_time))
    return count
I must note that using str as a variable name in Python is a bad idea, as it is the name of a built-in type. Anyway, I would implement your function the following way:
def countNonSpacingCharString(s):
    return len(filter(lambda x: not x in UNICODE_NSM, s))
in Python 2
def countNonSpacingCharString(s):
    return sum(1 for _ in filter(lambda x: not x in UNICODE_NSM, s))
in Python 3
Inspecting my function using dis.dis showed that it produced less bytecode than your version with count, thus suggesting it might be faster, though this needs further investigation.
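As an illustration of that kind of inspection (a minimal sketch, not from the original answer; it assumes UNICODE_NSM from the question is in scope):
import dis

def count_loop(s):
    count = 0
    for char in s:
        if char not in UNICODE_NSM:
            count += 1
    return count

def count_filter(s):
    return sum(1 for _ in filter(lambda x: x not in UNICODE_NSM, s))

dis.dis(count_loop)    # bytecode of the explicit-loop version
dis.dis(count_filter)  # compare with the filter-based version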
EDIT: I tested my code in Python 2, but not Python 3 - a version for Python 3 has been added, using Mohammad Banisaeid's answer from this topic.
EDIT 2: If you use UNICODE_NSM only for this, you might try using a set instead of a list, which should speed up the in operator, though again this needs further investigation. For a discussion of list vs set performance see this thread.
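A minimal sketch of that suggestion, assuming UNICODE_NSM as defined in the question:
UNICODE_NSM_SET = set(UNICODE_NSM)  # O(1) membership tests instead of scanning the list

def countNonSpacingCharString(s):
    return sum(1 for ch in s if ch not in UNICODE_NSM_SET)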
Perhaps the easiest way to do this is to use the unicodedata module. In part, because it will be more rigorously tested. Indeed, I found your list appeared to be including categories other than Mn. That is, it includes Unicode points from Mc (Mark, spacing combining) as well, but you said you only wanted to exclude Unicode points from Mn (Mark, Nonspacing).
eg.
import unicodedata

def countNonSpacingCharString(string):
    category = unicodedata.category
    return sum(category(char) != 'Mn' for char in string)
This appears to be about 60 times faster according to timeit.
You might get a TypeError, if your version of Python and therefore unicodedata is not up-to-date, and so not aware of recent additions to Unicode. You can get around this by installing unicodedata2 and using that instead.
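A minimal sketch of that swap, assuming the unicodedata2 package (which mirrors the unicodedata API) is installed:
try:
    import unicodedata2 as unicodedata  # ships a newer Unicode database
except ImportError:
    import unicodedata  # fall back to the standard library module

def countNonSpacingCharString(string):
    return sum(unicodedata.category(char) != 'Mn' for char in string)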
From your comments it looks like you're really after counting "user perceived characters". This is a complicated process with a number of edge cases. If you can, then you should install regex in your environment (that would be micropython?). You can then do:
>>> parts = regex.findall(r'\X', 'अब यहां से कहा जाएँ हम')
>>> parts
['अ', 'ब', ' ', 'य', 'हां', ' ', 'से', ' ', 'क', 'हा', ' ', 'जा', 'एँ', ' ', 'ह', 'म']
>>> len(parts)
16
Which splits your string into "user perceived characters", and then you can work on this list of strings to get what you need.
Failing that, your current solution of just ignoring Mark code points is an 80/20 solution (it gets you most of the way there for the least amount of effort). You will have to revise your list of Unicode marks though. My tests showed that your list was missing 113 code points across all the Indo-European and Dravidian scripts in Unicode (Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam, and Sinhala).
I extracted these characters by downloading and parsing: https://www.unicode.org/Public/11.0.0/ucd/UnicodeData.txt with the following code:
indian_script_range = range(0x0900, 0x0E00)  # doesn't include all indic scripts (eg. Thai)
basic_multilingual_plane = range(0x0000, 0x10000)
# use the latter if you want to be more thorough and include all indic scripts and non-indic scripts
codepoint_range = indian_script_range
codepoints = []
with open('UnicodeData.txt') as f:
    for line in f:
        hex_string, name, category, *rest = line.strip().split(';')
        codepoint_number = int(hex_string, base=16)
        if (
            category in ('Mn', 'Mc', 'Me')
            and (
                codepoint_number in codepoint_range
                or name.startswith('VARIATION SELECTOR')  # you seemed to want to include these
            )
        ):
            codepoints.append(chr(codepoint_number))
missing = set(codepoints) - set(UNICODE_NSM)
missing = set(codepoints) - set(UNICODE_NSM)
Mark Tolonen's answer is the fastest, because it uses a set for comparison. If you have a text of length n and m mark characters to compare against, then your worst-case runtime using a list is O(nm). Using a set for the mark characters reduces that to O(n).
Using unicodedata.category is just nicer because it is shorter and less prone to human error.
Performance comparison
You can clearly see that the markset_count and the category_count are way faster than the generator_count and the loop_count. Also the speed of the latter two varies way more. Interestingly, the generator_count is slower than the loop_count.
The markset_count is a bit faster than the category_count. I think that is the case because looking up the category and doing the string comparison also takes a bit of time. The difference is way more clear when you only plot the two and increase the text length:
import timeit
import sys
import unicodedata
import numpy as np
UNICODE_NSM = ['\u0300', '\u0301', '\u0302', '\u0303', '\u0304', '\u0305', '\u0306', '\u0307', '\u0308', '\u0309', '\u030A', '\u030B', '\u030C', '\u030D', '\u030E', '\u030F', '\u0310', '\u0311', '\u0312', '\u0313', '\u0314', '\u0315', '\u0316', '\u0317', '\u0318', '\u0319', '\u031A', '\u031B', '\u031C', '\u031D', '\u031E', '\u031F', '\u0320', '\u0321', '\u0322', '\u0323', '\u0324', '\u0325', '\u0326', '\u0327', '\u0328', '\u0329', '\u032A', '\u032B', '\u032C', '\u032D', '\u032E', '\u032F', '\u0330', '\u0331', '\u0332', '\u0333', '\u0334', '\u0335', '\u0336', '\u0337', '\u0338', '\u0339', '\u033A', '\u033B', '\u033C', '\u033D', '\u033E', '\u033F', '\u0340', '\u0341', '\u0342', '\u0343', '\u0344', '\u0345', '\u0346', '\u0347', '\u0348', '\u0349', '\u034A', '\u034B', '\u034C', '\u034D', '\u034E', '\u034F', '\u0350', '\u0351', '\u0352', '\u0353', '\u0354', '\u0355', '\u0356', '\u0357', '\u0358', '\u0359', '\u035A', '\u035B', '\u035C', '\u035D', '\u035E', '\u035F', '\u0360', '\u0361', '\u0362', '\u0363', '\u0364', '\u0365', '\u0366', '\u0367', '\u0368', '\u0369', '\u036A', '\u036B', '\u036C', '\u036D', '\u036E', '\u036F', '\u0483', '\u0484', '\u0485', '\u0486', '\u0487', '\u0591', '\u0592', '\u0593', '\u0594', '\u0595', '\u0596', '\u0597', '\u0598', '\u0599', '\u059A', '\u059B', '\u059C', '\u059D', '\u059E', '\u059F', '\u05A0', '\u05A1', '\u05A2', '\u05A3', '\u05A4', '\u05A5', '\u05A6', '\u05A7', '\u05A8', '\u05A9', '\u05AA', '\u05AB', '\u05AC', '\u05AD', '\u05AE', '\u05AF', '\u05B0', '\u05B1', '\u05B2', '\u05B3', '\u05B4', '\u05B5', '\u05B6', '\u05B7', '\u05B8', '\u05B9', '\u05BA', '\u05BB', '\u05BC', '\u05BD', '\u05BF', '\u05C1', '\u05C2', '\u05C4', '\u05C5', '\u05C7', '\u0610', '\u0611', '\u0612', '\u0613', '\u0614', '\u0615', '\u0616', '\u0617', '\u0618', '\u0619', '\u061A', '\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650', '\u0651', '\u0652', '\u0653', '\u0654', '\u0655', '\u0656', '\u0657', '\u0658', '\u0659', '\u065A', '\u065B', '\u065C', '\u065D', '\u065E', '\u065F', '\u0670', '\u06D6', '\u06D7', '\u06D8', '\u06D9', '\u06DA', '\u06DB', '\u06DC', '\u06DF', '\u06E0', '\u06E1', '\u06E2', '\u06E3', '\u06E4', '\u06E7', '\u06E8', '\u06EA', '\u06EB', '\u06EC', '\u06ED', '\u0711', '\u0730', '\u0731', '\u0732', '\u0733', '\u0734', '\u0735', '\u0736', '\u0737', '\u0738', '\u0739', '\u073A', '\u073B', '\u073C', '\u073D', '\u073E', '\u073F', '\u0740', '\u0741', '\u0742', '\u0743', '\u0744', '\u0745', '\u0746', '\u0747', '\u0748', '\u0749', '\u074A', '\u07A6', '\u07A7', '\u07A8', '\u07A9', '\u07AA', '\u07AB', '\u07AC', '\u07AD', '\u07AE', '\u07AF', '\u07B0', '\u07EB', '\u07EC', '\u07ED', '\u07EE', '\u07EF', '\u07F0', '\u07F1', '\u07F2', '\u07F3', '\u0816', '\u0817', '\u0818', '\u0819', '\u081B', '\u081C', '\u081D', '\u081E', '\u081F', '\u0820', '\u0821', '\u0822', '\u0823', '\u0825', '\u0826', '\u0827', '\u0829', '\u082A', '\u082B', '\u082C', '\u082D', '\u0859', '\u085A', '\u085B', '\u08E4', '\u08E5', '\u08E6', '\u08E7', '\u08E8', '\u08E9', '\u08EA', '\u08EB', '\u08EC', '\u08ED', '\u08EE', '\u08EF', '\u08F0', '\u08F1', '\u08F2', '\u08F3', '\u08F4', '\u08F5', '\u08F6', '\u08F7', '\u08F8', '\u08F9', '\u08FA', '\u08FB', '\u08FC', '\u08FD', '\u08FE', '\u0900', '\u0901', '\u0902', '\u093A', '\u093C', '\u093E', '\u0941', '\u0942', '\u0943', '\u0944', '\u0945', '\u0946', '\u0947', '\u0948', '\u094D', '\u0951', '\u0952', '\u0953', '\u0954', '\u0955', '\u0956', '\u0957', '\u0962', '\u0963', '\u0981', '\u09BC', '\u09C1', '\u09C2', '\u09C3', '\u09C4', '\u09CD', '\u09E2', '\u09E3', '\u0A01', '\u0A02', 
'\u0A3C', '\u0A41', '\u0A42', '\u0A47', '\u0A48', '\u0A4B', '\u0A4C', '\u0A4D', '\u0A51', '\u0A70', '\u0A71', '\u0A75', '\u0A81', '\u0A82', '\u0ABC', '\u0AC1', '\u0AC2', '\u0AC3', '\u0AC4', '\u0AC5', '\u0AC7', '\u0AC8', '\u0ACD', '\u0AE2', '\u0AE3', '\u0B01', '\u0B3C', '\u0B3F', '\u0B41', '\u0B42', '\u0B43', '\u0B44', '\u0B4D', '\u0B56', '\u0B62', '\u0B63', '\u0B82', '\u0BC0', '\u0BCD', '\u0C3E', '\u0C3F', '\u0C40', '\u0C46', '\u0C47', '\u0C48', '\u0C4A', '\u0C4B', '\u0C4C', '\u0C4D', '\u0C55', '\u0C56', '\u0C62', '\u0C63', '\u0CBC', '\u0CBF', '\u0CC6', '\u0CCC', '\u0CCD', '\u0CE2', '\u0CE3', '\u0D41', '\u0D42', '\u0D43', '\u0D44', '\u0D4D', '\u0D62', '\u0D63', '\u0DCA', '\u0DD2', '\u0DD3', '\u0DD4', '\u0DD6', '\u0E31', '\u0E34', '\u0E35', '\u0E36', '\u0E37', '\u0E38', '\u0E39', '\u0E3A', '\u0E47', '\u0E48', '\u0E49', '\u0E4A', '\u0E4B', '\u0E4C', '\u0E4D', '\u0E4E', '\u0EB1', '\u0EB4', '\u0EB5', '\u0EB6', '\u0EB7', '\u0EB8', '\u0EB9', '\u0EBB', '\u0EBC', '\u0EC8', '\u0EC9', '\u0ECA', '\u0ECB', '\u0ECC', '\u0ECD', '\u0F18', '\u0F19', '\u0F35', '\u0F37', '\u0F39', '\u0F71', '\u0F72', '\u0F73', '\u0F74', '\u0F75', '\u0F76', '\u0F77', '\u0F78', '\u0F79', '\u0F7A', '\u0F7B', '\u0F7C', '\u0F7D', '\u0F7E', '\u0F80', '\u0F81', '\u0F82', '\u0F83', '\u0F84', '\u0F86', '\u0F87', '\u0F8D', '\u0F8E', '\u0F8F', '\u0F90', '\u0F91', '\u0F92', '\u0F93', '\u0F94', '\u0F95', '\u0F96', '\u0F97', '\u0F99', '\u0F9A', '\u0F9B', '\u0F9C', '\u0F9D', '\u0F9E', '\u0F9F', '\u0FA0', '\u0FA1', '\u0FA2', '\u0FA3', '\u0FA4', '\u0FA5', '\u0FA6', '\u0FA7', '\u0FA8', '\u0FA9', '\u0FAA', '\u0FAB', '\u0FAC', '\u0FAD', '\u0FAE', '\u0FAF', '\u0FB0', '\u0FB1', '\u0FB2', '\u0FB3', '\u0FB4', '\u0FB5', '\u0FB6', '\u0FB7', '\u0FB8', '\u0FB9', '\u0FBA', '\u0FBB', '\u0FBC', '\u0FC6', '\u102D', '\u102E', '\u102F', '\u1030', '\u1032', '\u1033', '\u1034', '\u1035', '\u1036', '\u1037', '\u1039', '\u103A', '\u103D', '\u103E', '\u1058', '\u1059', '\u105E', '\u105F', '\u1060', '\u1071', '\u1072', '\u1073', '\u1074', '\u1082', '\u1085', '\u1086', '\u108D', '\u109D', '\u135D', '\u135E', '\u135F', '\u1712', '\u1713', '\u1714', '\u1732', '\u1733', '\u1734', '\u1752', '\u1753', '\u1772', '\u1773', '\u17B4', '\u17B5', '\u17B7', '\u17B8', '\u17B9', '\u17BA', '\u17BB', '\u17BC', '\u17BD', '\u17C6', '\u17C9', '\u17CA', '\u17CB', '\u17CC', '\u17CD', '\u17CE', '\u17CF', '\u17D0', '\u17D1', '\u17D2', '\u17D3', '\u17DD', '\u180B', '\u180C', '\u180D', '\u18A9', '\u1920', '\u1921', '\u1922', '\u1927', '\u1928', '\u1932', '\u1939', '\u193A', '\u193B', '\u1A17', '\u1A18', '\u1A56', '\u1A58', '\u1A59', '\u1A5A', '\u1A5B', '\u1A5C', '\u1A5D', '\u1A5E', '\u1A60', '\u1A62', '\u1A65', '\u1A66', '\u1A67', '\u1A68', '\u1A69', '\u1A6A', '\u1A6B', '\u1A6C', '\u1A73', '\u1A74', '\u1A75', '\u1A76', '\u1A77', '\u1A78', '\u1A79', '\u1A7A', '\u1A7B', '\u1A7C', '\u1A7F', '\u1B00', '\u1B01', '\u1B02', '\u1B03', '\u1B34', '\u1B36', '\u1B37', '\u1B38', '\u1B39', '\u1B3A', '\u1B3C', '\u1B42', '\u1B6B', '\u1B6C', '\u1B6D', '\u1B6E', '\u1B6F', '\u1B70', '\u1B71', '\u1B72', '\u1B73', '\u1B80', '\u1B81', '\u1BA2', '\u1BA3', '\u1BA4', '\u1BA5', '\u1BA8', '\u1BA9', '\u1BAB', '\u1BE6', '\u1BE8', '\u1BE9', '\u1BED', '\u1BEF', '\u1BF0', '\u1BF1', '\u1C2C', '\u1C2D', '\u1C2E', '\u1C2F', '\u1C30', '\u1C31', '\u1C32', '\u1C33', '\u1C36', '\u1C37', '\u1CD0', '\u1CD1', '\u1CD2', '\u1CD4', '\u1CD5', '\u1CD6', '\u1CD7', '\u1CD8', '\u1CD9', '\u1CDA', '\u1CDB', '\u1CDC', '\u1CDD', '\u1CDE', '\u1CDF', '\u1CE0', '\u1CE2', '\u1CE3', '\u1CE4', '\u1CE5', '\u1CE6', '\u1CE7', '\u1CE8', '\u1CED', 
'\u1CF4', '\u1DC0', '\u1DC1', '\u1DC2', '\u1DC3', '\u1DC4', '\u1DC5', '\u1DC6', '\u1DC7', '\u1DC8', '\u1DC9', '\u1DCA', '\u1DCB', '\u1DCC', '\u1DCD', '\u1DCE', '\u1DCF', '\u1DD0', '\u1DD1', '\u1DD2', '\u1DD3', '\u1DD4', '\u1DD5', '\u1DD6', '\u1DD7', '\u1DD8', '\u1DD9', '\u1DDA', '\u1DDB', '\u1DDC', '\u1DDD', '\u1DDE', '\u1DDF', '\u1DE0', '\u1DE1', '\u1DE2', '\u1DE3', '\u1DE4', '\u1DE5', '\u1DE6', '\u1DFC', '\u1DFD', '\u1DFE', '\u1DFF', '\u20D0', '\u20D1', '\u20D2', '\u20D3', '\u20D4', '\u20D5', '\u20D6', '\u20D7', '\u20D8', '\u20D9', '\u20DA', '\u20DB', '\u20DC', '\u20E1', '\u20E5', '\u20E6', '\u20E7', '\u20E8', '\u20E9', '\u20EA', '\u20EB', '\u20EC', '\u20ED', '\u20EE', '\u20EF', '\u20F0', '\u2CEF', '\u2CF0', '\u2CF1', '\u2D7F', '\u2DE0', '\u2DE1', '\u2DE2', '\u2DE3', '\u2DE4', '\u2DE5', '\u2DE6', '\u2DE7', '\u2DE8', '\u2DE9', '\u2DEA', '\u2DEB', '\u2DEC', '\u2DED', '\u2DEE', '\u2DEF', '\u2DF0', '\u2DF1', '\u2DF2', '\u2DF3', '\u2DF4', '\u2DF5', '\u2DF6', '\u2DF7', '\u2DF8', '\u2DF9', '\u2DFA', '\u2DFB', '\u2DFC', '\u2DFD', '\u2DFE', '\u2DFF', '\u302A', '\u302B', '\u302C', '\u302D', '\u3099', '\u309A', '\uA66F', '\uA674', '\uA675', '\uA676', '\uA677', '\uA678', '\uA679', '\uA67A', '\uA67B', '\uA67C', '\uA67D', '\uA69F', '\uA6F0', '\uA6F1', '\uA802', '\uA806', '\uA80B', '\uA825', '\uA826', '\uA8C4', '\uA8E0', '\uA8E1', '\uA8E2', '\uA8E3', '\uA8E4', '\uA8E5', '\uA8E6', '\uA8E7', '\uA8E8', '\uA8E9', '\uA8EA', '\uA8EB', '\uA8EC', '\uA8ED', '\uA8EE', '\uA8EF', '\uA8F0', '\uA8F1', '\uA926', '\uA927', '\uA928', '\uA929', '\uA92A', '\uA92B', '\uA92C', '\uA92D', '\uA947', '\uA948', '\uA949', '\uA94A', '\uA94B', '\uA94C', '\uA94D', '\uA94E', '\uA94F', '\uA950', '\uA951', '\uA980', '\uA981', '\uA982', '\uA9B3', '\uA9B6', '\uA9B7', '\uA9B8', '\uA9B9', '\uA9BC', '\uAA29', '\uAA2A', '\uAA2B', '\uAA2C', '\uAA2D', '\uAA2E', '\uAA31', '\uAA32', '\uAA35', '\uAA36', '\uAA43', '\uAA4C', '\uAAB0', '\uAAB2', '\uAAB3', '\uAAB4', '\uAAB7', '\uAAB8', '\uAABE', '\uAABF', '\uAAC1', '\uAAEC', '\uAAED', '\uAAF6', '\uABE5', '\uABE8', '\uABED', '\uFB1E', '\uFE00', '\uFE01', '\uFE02', '\uFE03', '\uFE04', '\uFE05', '\uFE06', '\uFE07', '\uFE08', '\uFE09', '\uFE0A', '\uFE0B', '\uFE0C', '\uFE0D', '\uFE0E', '\uFE0F', '\uFE20', '\uFE21', '\uFE22', '\uFE23', '\uFE24', '\uFE25', '\uFE26', '\U000101FD', '\U00010A01', '\U00010A02', '\U00010A03', '\U00010A05', '\U00010A06', '\U00010A0C', '\U00010A0D', '\U00010A0E', '\U00010A0F', '\U00010A38', '\U00010A39', '\U00010A3A', '\U00010A3F', '\U00011001', '\U00011038', '\U00011039', '\U0001103A', '\U0001103B', '\U0001103C', '\U0001103D', '\U0001103E', '\U0001103F', '\U00011040', '\U00011041', '\U00011042', '\U00011043', '\U00011044', '\U00011045', '\U00011046', '\U00011080', '\U00011081', '\U000110B3', '\U000110B4', '\U000110B5', '\U000110B6', '\U000110B9', '\U000110BA', '\U00011100', '\U00011101', '\U00011102', '\U00011127', '\U00011128', '\U00011129', '\U0001112A', '\U0001112B', '\U0001112D', '\U0001112E', '\U0001112F', '\U00011130', '\U00011131', '\U00011132', '\U00011133', '\U00011134', '\U00011180', '\U00011181', '\U000111B6', '\U000111B7', '\U000111B8', '\U000111B9', '\U000111BA', '\U000111BB', '\U000111BC', '\U000111BD', '\U000111BE', '\U000116AB', '\U000116AD', '\U000116B0', '\U000116B1', '\U000116B2', '\U000116B3', '\U000116B4', '\U000116B5', '\U000116B7', '\U00016F8F', '\U00016F90', '\U00016F91', '\U00016F92', '\U0001D167', '\U0001D168', '\U0001D169', '\U0001D17B', '\U0001D17C', '\U0001D17D', '\U0001D17E', '\U0001D17F', '\U0001D180', '\U0001D181', '\U0001D182', '\U0001D185', 
'\U0001D186', '\U0001D187', '\U0001D188', '\U0001D189', '\U0001D18A', '\U0001D18B', '\U0001D1AA', '\U0001D1AB', '\U0001D1AC', '\U0001D1AD', '\U0001D242', '\U0001D243', '\U0001D244', '\U000E0100', '\U000E0101', '\U000E0102', '\U000E0103', '\U000E0104', '\U000E0105', '\U000E0106', '\U000E0107', '\U000E0108', '\U000E0109', '\U000E010A', '\U000E010B', '\U000E010C', '\U000E010D', '\U000E010E', '\U000E010F', '\U000E0110', '\U000E0111', '\U000E0112', '\U000E0113', '\U000E0114', '\U000E0115', '\U000E0116', '\U000E0117', '\U000E0118', '\U000E0119', '\U000E011A', '\U000E011B', '\U000E011C', '\U000E011D', '\U000E011E', '\U000E011F', '\U000E0120', '\U000E0121', '\U000E0122', '\U000E0123', '\U000E0124', '\U000E0125', '\U000E0126', '\U000E0127', '\U000E0128', '\U000E0129', '\U000E012A', '\U000E012B', '\uE012C', '\U000E012D', '\U000E012E', '\U000E012F', '\U000E0130', '\U000E0131', '\U000E0132', '\U000E0133', '\U000E0134', '\U000E0135', '\U000E0136', '\U000E0137', '\U000E0138', '\U000E0139', '\U000E013A', '\U000E013B', '\U000E013C', '\U000E013D', '\U000E013E', '\U000E013F', '\U000E0140', '\U000E0141', '\U000E0142', '\U000E0143', '\U000E0144', '\U000E0145', '\U000E0146', '\U000E0147', '\U000E0148', '\U000E0149', '\U000E014A', '\U000E014B', '\U000E014C', '\U000E014D', '\U000E014E', '\U000E014F', '\U000E0150', '\U000E0151', '\U000E0152', '\U000E0153', '\U000E0154', '\U000E0155', '\U000E0156', '\U000E0157', '\U000E0158', '\U000E0159', '\U000E015A', '\U000E015B', '\U000E015C', '\U000E015D', '\U000E015E', '\U000E015F', '\U000E0160', '\U000E0161', '\U000E0162', '\U000E0163', '\U000E0164', '\U000E0165', '\U000E0166', '\U000E0167', '\U000E0168', '\U000E0169', '\U000E016A', '\U000E016B', '\U000E016C', '\U000E016D', '\U000E016E', '\U000E016F', '\U000E0170', '\U000E0171', '\U000E0172', '\U000E0173', '\U000E0174', '\U000E0175', '\U000E0176', '\U000E0177', '\U000E0178', '\U000E0179', '\U000E017A', '\U000E017B', '\U000E017C', '\U000E017D', '\U000E017E', '\U000E017F', '\U000E0180', '\U000E0181', '\U000E0182', '\U000E0183', '\U000E0184', '\U000E0185', '\uE0186', '\U000E0187', '\U000E0188', '\U000E0189', '\U000E018A', '\U000E018B', '\U000E018C', '\U000E018D', '\U000E018E', '\U000E018F', '\U000E0190', '\U000E0191', '\U000E0192', '\U000E0193', '\U000E0194', '\U000E0195', '\U000E0196', '\U000E0197', '\U000E0198', '\U000E0199', '\U000E019A', '\U000E019B', '\U000E019C', '\U000E019D', '\U000E019E', '\U000E019F', '\U000E01A0', '\U000E01A1', '\U000E01A2', '\U000E01A3', '\U000E01A4', '\U000E01A5', '\U000E01A6', '\U000E01A7', '\U000E01A8', '\U000E01A9', '\U000E01AA', '\U000E01AB', '\U000E01AC', '\U000E01AD', '\U000E01AE', '\U000E01AF', '\U000E01B0', '\U000E01B1', '\U000E01B2', '\U000E01B3', '\U000E01B4', '\U000E01B5', '\U000E01B6', '\U000E01B7', '\U000E01B8', '\U000E01B9', '\U000E01BA', '\U000E01BB', '\U000E01BC', '\U000E01BD', '\U000E01BE', '\U000E01BF', '\U000E01C0', '\U000E01C1', '\U000E01C2', '\U000E01C3', '\U000E01C4', '\U000E01C5', '\U000E01C6', '\U000E01C7', '\U000E01C8', '\U000E01C9', '\U000E01CA', '\U000E01CB', '\U000E01CC', '\U000E01CD', '\U000E01CE', '\U000E01CF', '\U000E01D0', '\U000E01D1', '\U000E01D2', '\U000E01D3', '\U000E01D4', '\U000E01D5', '\U000E01D6', '\U000E01D7', '\U000E01D8', '\U000E01D9', '\U000E01DA', '\U000E01DB', '\U000E01DC', '\U000E01DD', '\U000E01DE', '\U000E01DF', '\U000E01E0', '\U000E01E1', '\U000E01E2', '\U000E01E3', '\U000E01E4', '\U000E01E5', '\U000E01E6', '\U000E01E7', '\U000E01E8', '\U000E01E9', '\U000E01EA', '\U000E01EB', '\U000E01EC', '\U000E01ED', '\U000E01EE', '\U000E01EF']
MARK_SET = set(chr(c) for c in range(sys.maxunicode + 1) if unicodedata.category(chr(c))[0] == 'M')
print('len(UNICODE_NSM) = {}'.format(len(UNICODE_NSM)))
print('len(MARK_SET) = {}'.format(len(MARK_SET)))
filepath = "UnicodeData.txt"
with open(filepath) as f:
text = f.read()
text = text[:1000]
def main():
    ground_truth = loop_count(text)
    functions = [(loop_count, 'loop_count'),
                 (generator_count, 'generator_count'),
                 (category_count, 'category_count'),
                 (markset_count, 'markset_count'),
                 ]
    functions = functions[::-1]
    duration_list = {}
    for func, name in functions:
        is_correct = func(text) == ground_truth
        durations = timeit.repeat(lambda: func(text), repeat=500, number=3)
        if is_correct:
            correctness = 'correct'
        else:
            correctness = 'NOT correct'
        duration_list[name] = durations
        print('{func:<20}: {correctness}, '
              'min: {min:0.3f}s, mean: {mean:0.3f}s, max: {max:0.3f}s'
              .format(func=name,
                      correctness=correctness,
                      min=min(durations),
                      mean=np.mean(durations),
                      max=max(durations),
                      ))
    create_boxplot(duration_list)


def create_boxplot(duration_list):
    import seaborn as sns
    import matplotlib.pyplot as plt
    import operator

    plt.figure(num=None, figsize=(8, 4), dpi=300,
               facecolor='w', edgecolor='k')
    sns.set(style="whitegrid")
    sorted_keys, sorted_vals = zip(*sorted(duration_list.items(), key=operator.itemgetter(1)))
    flierprops = dict(markerfacecolor='0.75', markersize=1,
                      linestyle='none')
    ax = sns.boxplot(data=sorted_vals, width=.3, orient='h',
                     flierprops=flierprops,)
    ax.set(xlabel="Time in s", ylabel="")
    plt.yticks(plt.yticks()[0], sorted_keys)
    plt.tight_layout()
    plt.savefig("output.png")


def generator_count(text):
    return sum(1 for char in text if char not in UNICODE_NSM)


def loop_count(text):
    # 1769137
    count = 0
    for char in text:
        if char not in UNICODE_NSM:
            count += 1
    return count


def markset_count(text):
    return sum(char not in MARK_SET for char in text)


def category_count(text):
    return sum(unicodedata.category(char) != 'Mn' for char in text)


if __name__ == '__main__':
    main()

I wish to extract compound noun-adjective pairs from a sentence. So, basically I want something like:

For the adjective:
"The company's customer service was terrible."
{customer service, terrible}
For the verb:
"They kept increasing my phone bill"
{phone bill, increasing}
This is a branch question from this posting.
However I'm trying to find adj and verbs corresponding to multi-token phrases/compound nouns such as "customer service" using spacy.
I'm not sure how to do this with spacy, nltk, or any other prepackaged natural language processing software, and I'd appreciate any help!
For simple examples like this, you can use spaCy's dependency parsing with a few simple rules.
First, to identify multi-word nouns similar to the examples given, you can use the "compound" dependency. After parsing a document (e.g., a sentence) with spaCy, use a token's dep_ attribute to find its dependency.
For example, this sentence has two compound nouns:
"The compound dependency identifies compound nouns."
Each token and its dependency is shown below:
import spacy
import pandas as pd

nlp = spacy.load('en')
example_doc = nlp("The compound dependency identifies compound nouns.")
for tok in example_doc:
    print(tok.i, tok, "[", tok.dep_, "]")

>>>0 The [ det ]
>>>1 compound [ compound ]
>>>2 dependency [ nsubj ]
>>>3 identifies [ ROOT ]
>>>4 compound [ compound ]
>>>5 nouns [ dobj ]
>>>6 . [ punct ]

for tok in [tok for tok in example_doc if tok.dep_ == 'compound']:  # Get list of compounds in doc
    noun = example_doc[tok.i: tok.head.i + 1]
    print(noun)

>>>compound dependency
>>>compound nouns
The below function works for your examples. However, it will likely not work for more complicated sentences.
adj_doc = nlp("The company's customer service was terrible.")
verb_doc = nlp("They kept increasing my phone bill")

def get_compound_pairs(doc, verbose=False):
    """Return tuples of (multi-noun word, adjective or verb) for document."""
    compounds = [tok for tok in doc if tok.dep_ == 'compound']  # Get list of compounds in doc
    compounds = [c for c in compounds if c.i == 0 or doc[c.i - 1].dep_ != 'compound']  # Remove middle parts of compound nouns, but avoid index errors
    tuple_list = []
    if compounds:
        for tok in compounds:
            pair_item_1, pair_item_2 = (False, False)  # initialize false variables
            noun = doc[tok.i: tok.head.i + 1]
            pair_item_1 = noun
            # If noun is in the subject, we may be looking for adjective in predicate
            # In simple cases, this would mean that the noun shares a head with the adjective
            if noun.root.dep_ == 'nsubj':
                adj_list = [r for r in noun.root.head.rights if r.pos_ == 'ADJ']
                if adj_list:
                    pair_item_2 = adj_list[0]
                if verbose == True:  # For trying different dependency tree parsing rules
                    print("Noun: ", noun)
                    print("Noun root: ", noun.root)
                    print("Noun root head: ", noun.root.head)
                    print("Noun root head rights: ", [r for r in noun.root.head.rights if r.pos_ == 'ADJ'])
            if noun.root.dep_ == 'dobj':
                verb_ancestor_list = [a for a in noun.root.ancestors if a.pos_ == 'VERB']
                if verb_ancestor_list:
                    pair_item_2 = verb_ancestor_list[0]
                if verbose == True:  # For trying different dependency tree parsing rules
                    print("Noun: ", noun)
                    print("Noun root: ", noun.root)
                    print("Noun root head: ", noun.root.head)
                    print("Noun root head verb ancestors: ", [a for a in noun.root.ancestors if a.pos_ == 'VERB'])
            if pair_item_1 and pair_item_2:
                tuple_list.append((pair_item_1, pair_item_2))
    return tuple_list

get_compound_pairs(adj_doc)
>>>[(customer service, terrible)]
get_compound_pairs(verb_doc)
>>>[(phone bill, increasing)]
get_compound_pairs(example_doc, verbose=True)
>>>Noun: compound dependency
>>>Noun root: dependency
>>>Noun root head: identifies
>>>Noun root head rights: []
>>>Noun: compound nouns
>>>Noun root: nouns
>>>Noun root head: identifies
>>>Noun root head verb ancestors: [identifies]
>>>[(compound nouns, identifies)]
I needed to solve a similar problem and I wanted to share my solution as a spaCy custom component.
import spacy
from spacy.tokens import Token, Span
from spacy.language import Language

@Language.component("compound_chainer")
def find_compounds(doc):
    Token.set_extension("is_compound_chain", default=False)
    com_range = []
    max_ind = len(doc)
    for idx, tok in enumerate(doc):
        if (tok.dep_ == "compound") and (idx < max_ind):
            com_range.append([idx, idx + 1])
    to_remove = []
    intersections = []
    for t1 in com_range:
        for t2 in com_range:
            if t1 != t2:
                s1 = set(t1)
                s2 = set(t2)
                if len(s1.intersection(s2)) > 0:
                    to_remove.append(t1)
                    to_remove.append(t2)
                    union = list(s1.union(s2))
                    if union not in intersections:
                        intersections.append(union)
    r = [t for t in com_range if t not in to_remove]
    compound_ranges = r + intersections
    spans = []
    for cr in compound_ranges:
        # Example cr: [[0, 1], [3, 4], [12, 13], [16, 17, 18]]
        entity = Span(doc, min(cr), max(cr) + 1, label="compound_chain")
        for token in entity:
            token._.set("is_compound_chain", True)
        spans.append(entity)
    doc.ents = list(doc.ents) + spans
    return doc
Github link: https://github.com/eboraks/job-description-nlp-analysis/blob/main/src/components/compound_chainer.py

Truecasing - SpaCy

The intent is to capitalize based on POS tags, which I could achieve with the help of the link below.
How can I best determine the correct capitalization for a word?
I am trying to achieve similar results using spaCy:
def truecase(doc):
    truecased_sents = []  # list of truecased sentences
    tagged_sent = token.tag_([word.lower() for token in doc])
    normalized_sent = [w.capitalize() if t in ["NN","NNS"] else w for (w,t) in tagged_sent]
    normalized_sent[0] = normalized_sent[0].capitalize()
    string = re.sub(" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))
    return string
It throws this error:
tagged_sent = token.tag_([word.lower() for token in doc])
NameError: global name 'token' is not defined
How do I declare token as global and solve this issue? Is my approach correct?
import spacy, re

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'autonomous cars shift insurance liability toward manufacturers.')
tagged_sent = [(w.text, w.tag_) for w in doc]
normalized_sent = [w.capitalize() if t in ["NN","NNS"] else w for (w,t) in tagged_sent]
normalized_sent[0] = normalized_sent[0].capitalize()
string = re.sub(" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))
print(string)
Output:
Autonomous Cars shift Insurance Liability toward Manufacturers.
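If you want this in the function form from the question, a minimal sketch wrapping the same logic (my own addition, not part of the original answer) could look like:
import re
import spacy

nlp = spacy.load('en_core_web_sm')

def truecase(text):
    # re-parse the lowercased text so the tagger sees lowercase input, as in the question
    doc = nlp(text.lower())
    tagged_sent = [(w.text, w.tag_) for w in doc]
    normalized_sent = [w.capitalize() if t in ["NN", "NNS"] else w for (w, t) in tagged_sent]
    normalized_sent[0] = normalized_sent[0].capitalize()
    return re.sub(r" (?=[\.,'!?:;])", "", ' '.join(normalized_sent))

print(truecase(u'autonomous cars shift insurance liability toward manufacturers.'))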
