Given the Unicode non spacing marks list - https://www.fileformat.info/info/unicode/category/Mn/list.htm
UNICODE_NSM = ['\u0300', '\u0301', '\u0302', '\u0303', '\u0304', '\u0305', '\u0306', '\u0307', '\u0308', '\u0309', '\u030A', '\u030B', '\u030C', '\u030D', '\u030E', '\u030F', '\u0310', '\u0311', '\u0312', '\u0313', '\u0314', '\u0315', '\u0316', '\u0317', '\u0318', '\u0319', '\u031A', '\u031B', '\u031C', '\u031D', '\u031E', '\u031F', '\u0320', '\u0321', '\u0322', '\u0323', '\u0324', '\u0325', '\u0326', '\u0327', '\u0328', '\u0329', '\u032A', '\u032B', '\u032C', '\u032D', '\u032E', '\u032F', '\u0330', '\u0331', '\u0332', '\u0333', '\u0334', '\u0335', '\u0336', '\u0337', '\u0338', '\u0339', '\u033A', '\u033B', '\u033C', '\u033D', '\u033E', '\u033F', '\u0340', '\u0341', '\u0342', '\u0343', '\u0344', '\u0345', '\u0346', '\u0347', '\u0348', '\u0349', '\u034A', '\u034B', '\u034C', '\u034D', '\u034E', '\u034F', '\u0350', '\u0351', '\u0352', '\u0353', '\u0354', '\u0355', '\u0356', '\u0357', '\u0358', '\u0359', '\u035A', '\u035B', '\u035C', '\u035D', '\u035E', '\u035F', '\u0360', '\u0361', '\u0362', '\u0363', '\u0364', '\u0365', '\u0366', '\u0367', '\u0368', '\u0369', '\u036A', '\u036B', '\u036C', '\u036D', '\u036E', '\u036F', '\u0483', '\u0484', '\u0485', '\u0486', '\u0487', '\u0591', '\u0592', '\u0593', '\u0594', '\u0595', '\u0596', '\u0597', '\u0598', '\u0599', '\u059A', '\u059B', '\u059C', '\u059D', '\u059E', '\u059F', '\u05A0', '\u05A1', '\u05A2', '\u05A3', '\u05A4', '\u05A5', '\u05A6', '\u05A7', '\u05A8', '\u05A9', '\u05AA', '\u05AB', '\u05AC', '\u05AD', '\u05AE', '\u05AF', '\u05B0', '\u05B1', '\u05B2', '\u05B3', '\u05B4', '\u05B5', '\u05B6', '\u05B7', '\u05B8', '\u05B9', '\u05BA', '\u05BB', '\u05BC', '\u05BD', '\u05BF', '\u05C1', '\u05C2', '\u05C4', '\u05C5', '\u05C7', '\u0610', '\u0611', '\u0612', '\u0613', '\u0614', '\u0615', '\u0616', '\u0617', '\u0618', '\u0619', '\u061A', '\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650', '\u0651', '\u0652', '\u0653', '\u0654', '\u0655', '\u0656', '\u0657', '\u0658', '\u0659', '\u065A', '\u065B', '\u065C', '\u065D', 
'\u065E', '\u065F', '\u0670', '\u06D6', '\u06D7', '\u06D8', '\u06D9', '\u06DA', '\u06DB', '\u06DC', '\u06DF', '\u06E0', '\u06E1', '\u06E2', '\u06E3', '\u06E4', '\u06E7', '\u06E8', '\u06EA', '\u06EB', '\u06EC', '\u06ED', '\u0711', '\u0730', '\u0731', '\u0732', '\u0733', '\u0734', '\u0735', '\u0736', '\u0737', '\u0738', '\u0739', '\u073A', '\u073B', '\u073C', '\u073D', '\u073E', '\u073F', '\u0740', '\u0741', '\u0742', '\u0743', '\u0744', '\u0745', '\u0746', '\u0747', '\u0748', '\u0749', '\u074A', '\u07A6', '\u07A7', '\u07A8', '\u07A9', '\u07AA', '\u07AB', '\u07AC', '\u07AD', '\u07AE', '\u07AF', '\u07B0', '\u07EB', '\u07EC', '\u07ED', '\u07EE', '\u07EF', '\u07F0', '\u07F1', '\u07F2', '\u07F3', '\u0816', '\u0817', '\u0818', '\u0819', '\u081B', '\u081C', '\u081D', '\u081E', '\u081F', '\u0820', '\u0821', '\u0822', '\u0823', '\u0825', '\u0826', '\u0827', '\u0829', '\u082A', '\u082B', '\u082C', '\u082D', '\u0859', '\u085A', '\u085B', '\u08E4', '\u08E5', '\u08E6', '\u08E7', '\u08E8', '\u08E9', '\u08EA', '\u08EB', '\u08EC', '\u08ED', '\u08EE', '\u08EF', '\u08F0', '\u08F1', '\u08F2', '\u08F3', '\u08F4', '\u08F5', '\u08F6', '\u08F7', '\u08F8', '\u08F9', '\u08FA', '\u08FB', '\u08FC', '\u08FD', '\u08FE', '\u0900', '\u0901', '\u0902', '\u093A', '\u093C', '\u093E', '\u0941', '\u0942', '\u0943', '\u0944', '\u0945', '\u0946', '\u0947', '\u0948', '\u094D', '\u0951', '\u0952', '\u0953', '\u0954', '\u0955', '\u0956', '\u0957', '\u0962', '\u0963', '\u0981', '\u09BC', '\u09C1', '\u09C2', '\u09C3', '\u09C4', '\u09CD', '\u09E2', '\u09E3', '\u0A01', '\u0A02', '\u0A3C', '\u0A41', '\u0A42', '\u0A47', '\u0A48', '\u0A4B', '\u0A4C', '\u0A4D', '\u0A51', '\u0A70', '\u0A71', '\u0A75', '\u0A81', '\u0A82', '\u0ABC', '\u0AC1', '\u0AC2', '\u0AC3', '\u0AC4', '\u0AC5', '\u0AC7', '\u0AC8', '\u0ACD', '\u0AE2', '\u0AE3', '\u0B01', '\u0B3C', '\u0B3F', '\u0B41', '\u0B42', '\u0B43', '\u0B44', '\u0B4D', '\u0B56', '\u0B62', '\u0B63', '\u0B82', '\u0BC0', '\u0BCD', '\u0C3E', '\u0C3F', '\u0C40', '\u0C46', '\u0C47', 
'\u0C48', '\u0C4A', '\u0C4B', '\u0C4C', '\u0C4D', '\u0C55', '\u0C56', '\u0C62', '\u0C63', '\u0CBC', '\u0CBF', '\u0CC6', '\u0CCC', '\u0CCD', '\u0CE2', '\u0CE3', '\u0D41', '\u0D42', '\u0D43', '\u0D44', '\u0D4D', '\u0D62', '\u0D63', '\u0DCA', '\u0DD2', '\u0DD3', '\u0DD4', '\u0DD6', '\u0E31', '\u0E34', '\u0E35', '\u0E36', '\u0E37', '\u0E38', '\u0E39', '\u0E3A', '\u0E47', '\u0E48', '\u0E49', '\u0E4A', '\u0E4B', '\u0E4C', '\u0E4D', '\u0E4E', '\u0EB1', '\u0EB4', '\u0EB5', '\u0EB6', '\u0EB7', '\u0EB8', '\u0EB9', '\u0EBB', '\u0EBC', '\u0EC8', '\u0EC9', '\u0ECA', '\u0ECB', '\u0ECC', '\u0ECD', '\u0F18', '\u0F19', '\u0F35', '\u0F37', '\u0F39', '\u0F71', '\u0F72', '\u0F73', '\u0F74', '\u0F75', '\u0F76', '\u0F77', '\u0F78', '\u0F79', '\u0F7A', '\u0F7B', '\u0F7C', '\u0F7D', '\u0F7E', '\u0F80', '\u0F81', '\u0F82', '\u0F83', '\u0F84', '\u0F86', '\u0F87', '\u0F8D', '\u0F8E', '\u0F8F', '\u0F90', '\u0F91', '\u0F92', '\u0F93', '\u0F94', '\u0F95', '\u0F96', '\u0F97', '\u0F99', '\u0F9A', '\u0F9B', '\u0F9C', '\u0F9D', '\u0F9E', '\u0F9F', '\u0FA0', '\u0FA1', '\u0FA2', '\u0FA3', '\u0FA4', '\u0FA5', '\u0FA6', '\u0FA7', '\u0FA8', '\u0FA9', '\u0FAA', '\u0FAB', '\u0FAC', '\u0FAD', '\u0FAE', '\u0FAF', '\u0FB0', '\u0FB1', '\u0FB2', '\u0FB3', '\u0FB4', '\u0FB5', '\u0FB6', '\u0FB7', '\u0FB8', '\u0FB9', '\u0FBA', '\u0FBB', '\u0FBC', '\u0FC6', '\u102D', '\u102E', '\u102F', '\u1030', '\u1032', '\u1033', '\u1034', '\u1035', '\u1036', '\u1037', '\u1039', '\u103A', '\u103D', '\u103E', '\u1058', '\u1059', '\u105E', '\u105F', '\u1060', '\u1071', '\u1072', '\u1073', '\u1074', '\u1082', '\u1085', '\u1086', '\u108D', '\u109D', '\u135D', '\u135E', '\u135F', '\u1712', '\u1713', '\u1714', '\u1732', '\u1733', '\u1734', '\u1752', '\u1753', '\u1772', '\u1773', '\u17B4', '\u17B5', '\u17B7', '\u17B8', '\u17B9', '\u17BA', '\u17BB', '\u17BC', '\u17BD', '\u17C6', '\u17C9', '\u17CA', '\u17CB', '\u17CC', '\u17CD', '\u17CE', '\u17CF', '\u17D0', '\u17D1', '\u17D2', '\u17D3', '\u17DD', '\u180B', '\u180C', '\u180D', '\u18A9', 
'\u1920', '\u1921', '\u1922', '\u1927', '\u1928', '\u1932', '\u1939', '\u193A', '\u193B', '\u1A17', '\u1A18', '\u1A56', '\u1A58', '\u1A59', '\u1A5A', '\u1A5B', '\u1A5C', '\u1A5D', '\u1A5E', '\u1A60', '\u1A62', '\u1A65', '\u1A66', '\u1A67', '\u1A68', '\u1A69', '\u1A6A', '\u1A6B', '\u1A6C', '\u1A73', '\u1A74', '\u1A75', '\u1A76', '\u1A77', '\u1A78', '\u1A79', '\u1A7A', '\u1A7B', '\u1A7C', '\u1A7F', '\u1B00', '\u1B01', '\u1B02', '\u1B03', '\u1B34', '\u1B36', '\u1B37', '\u1B38', '\u1B39', '\u1B3A', '\u1B3C', '\u1B42', '\u1B6B', '\u1B6C', '\u1B6D', '\u1B6E', '\u1B6F', '\u1B70', '\u1B71', '\u1B72', '\u1B73', '\u1B80', '\u1B81', '\u1BA2', '\u1BA3', '\u1BA4', '\u1BA5', '\u1BA8', '\u1BA9', '\u1BAB', '\u1BE6', '\u1BE8', '\u1BE9', '\u1BED', '\u1BEF', '\u1BF0', '\u1BF1', '\u1C2C', '\u1C2D', '\u1C2E', '\u1C2F', '\u1C30', '\u1C31', '\u1C32', '\u1C33', '\u1C36', '\u1C37', '\u1CD0', '\u1CD1', '\u1CD2', '\u1CD4', '\u1CD5', '\u1CD6', '\u1CD7', '\u1CD8', '\u1CD9', '\u1CDA', '\u1CDB', '\u1CDC', '\u1CDD', '\u1CDE', '\u1CDF', '\u1CE0', '\u1CE2', '\u1CE3', '\u1CE4', '\u1CE5', '\u1CE6', '\u1CE7', '\u1CE8', '\u1CED', '\u1CF4', '\u1DC0', '\u1DC1', '\u1DC2', '\u1DC3', '\u1DC4', '\u1DC5', '\u1DC6', '\u1DC7', '\u1DC8', '\u1DC9', '\u1DCA', '\u1DCB', '\u1DCC', '\u1DCD', '\u1DCE', '\u1DCF', '\u1DD0', '\u1DD1', '\u1DD2', '\u1DD3', '\u1DD4', '\u1DD5', '\u1DD6', '\u1DD7', '\u1DD8', '\u1DD9', '\u1DDA', '\u1DDB', '\u1DDC', '\u1DDD', '\u1DDE', '\u1DDF', '\u1DE0', '\u1DE1', '\u1DE2', '\u1DE3', '\u1DE4', '\u1DE5', '\u1DE6', '\u1DFC', '\u1DFD', '\u1DFE', '\u1DFF', '\u20D0', '\u20D1', '\u20D2', '\u20D3', '\u20D4', '\u20D5', '\u20D6', '\u20D7', '\u20D8', '\u20D9', '\u20DA', '\u20DB', '\u20DC', '\u20E1', '\u20E5', '\u20E6', '\u20E7', '\u20E8', '\u20E9', '\u20EA', '\u20EB', '\u20EC', '\u20ED', '\u20EE', '\u20EF', '\u20F0', '\u2CEF', '\u2CF0', '\u2CF1', '\u2D7F', '\u2DE0', '\u2DE1', '\u2DE2', '\u2DE3', '\u2DE4', '\u2DE5', '\u2DE6', '\u2DE7', '\u2DE8', '\u2DE9', '\u2DEA', '\u2DEB', '\u2DEC', '\u2DED', '\u2DEE', 
'\u2DEF', '\u2DF0', '\u2DF1', '\u2DF2', '\u2DF3', '\u2DF4', '\u2DF5', '\u2DF6', '\u2DF7', '\u2DF8', '\u2DF9', '\u2DFA', '\u2DFB', '\u2DFC', '\u2DFD', '\u2DFE', '\u2DFF', '\u302A', '\u302B', '\u302C', '\u302D', '\u3099', '\u309A', '\uA66F', '\uA674', '\uA675', '\uA676', '\uA677', '\uA678', '\uA679', '\uA67A', '\uA67B', '\uA67C', '\uA67D', '\uA69F', '\uA6F0', '\uA6F1', '\uA802', '\uA806', '\uA80B', '\uA825', '\uA826', '\uA8C4', '\uA8E0', '\uA8E1', '\uA8E2', '\uA8E3', '\uA8E4', '\uA8E5', '\uA8E6', '\uA8E7', '\uA8E8', '\uA8E9', '\uA8EA', '\uA8EB', '\uA8EC', '\uA8ED', '\uA8EE', '\uA8EF', '\uA8F0', '\uA8F1', '\uA926', '\uA927', '\uA928', '\uA929', '\uA92A', '\uA92B', '\uA92C', '\uA92D', '\uA947', '\uA948', '\uA949', '\uA94A', '\uA94B', '\uA94C', '\uA94D', '\uA94E', '\uA94F', '\uA950', '\uA951', '\uA980', '\uA981', '\uA982', '\uA9B3', '\uA9B6', '\uA9B7', '\uA9B8', '\uA9B9', '\uA9BC', '\uAA29', '\uAA2A', '\uAA2B', '\uAA2C', '\uAA2D', '\uAA2E', '\uAA31', '\uAA32', '\uAA35', '\uAA36', '\uAA43', '\uAA4C', '\uAAB0', '\uAAB2', '\uAAB3', '\uAAB4', '\uAAB7', '\uAAB8', '\uAABE', '\uAABF', '\uAAC1', '\uAAEC', '\uAAED', '\uAAF6', '\uABE5', '\uABE8', '\uABED', '\uFB1E', '\uFE00', '\uFE01', '\uFE02', '\uFE03', '\uFE04', '\uFE05', '\uFE06', '\uFE07', '\uFE08', '\uFE09', '\uFE0A', '\uFE0B', '\uFE0C', '\uFE0D', '\uFE0E', '\uFE0F', '\uFE20', '\uFE21', '\uFE22', '\uFE23', '\uFE24', '\uFE25', '\uFE26', '\U000101FD', '\U00010A01', '\U00010A02', '\U00010A03', '\U00010A05', '\U00010A06', '\U00010A0C', '\U00010A0D', '\U00010A0E', '\U00010A0F', '\U00010A38', '\U00010A39', '\U00010A3A', '\U00010A3F', '\U00011001', '\U00011038', '\U00011039', '\U0001103A', '\U0001103B', '\U0001103C', '\U0001103D', '\U0001103E', '\U0001103F', '\U00011040', '\U00011041', '\U00011042', '\U00011043', '\U00011044', '\U00011045', '\U00011046', '\U00011080', '\U00011081', '\U000110B3', '\U000110B4', '\U000110B5', '\U000110B6', '\U000110B9', '\U000110BA', '\U00011100', '\U00011101', '\U00011102', '\U00011127', 
'\U00011128', '\U00011129', '\U0001112A', '\U0001112B', '\U0001112D', '\U0001112E', '\U0001112F', '\U00011130', '\U00011131', '\U00011132', '\U00011133', '\U00011134', '\U00011180', '\U00011181', '\U000111B6', '\U000111B7', '\U000111B8', '\U000111B9', '\U000111BA', '\U000111BB', '\U000111BC', '\U000111BD', '\U000111BE', '\U000116AB', '\U000116AD', '\U000116B0', '\U000116B1', '\U000116B2', '\U000116B3', '\U000116B4', '\U000116B5', '\U000116B7', '\U00016F8F', '\U00016F90', '\U00016F91', '\U00016F92', '\U0001D167', '\U0001D168', '\U0001D169', '\U0001D17B', '\U0001D17C', '\U0001D17D', '\U0001D17E', '\U0001D17F', '\U0001D180', '\U0001D181', '\U0001D182', '\U0001D185', '\U0001D186', '\U0001D187', '\U0001D188', '\U0001D189', '\U0001D18A', '\U0001D18B', '\U0001D1AA', '\U0001D1AB', '\U0001D1AC', '\U0001D1AD', '\U0001D242', '\U0001D243', '\U0001D244', '\U000E0100', '\U000E0101', '\U000E0102', '\U000E0103', '\U000E0104', '\U000E0105', '\U000E0106', '\U000E0107', '\U000E0108', '\U000E0109', '\U000E010A', '\U000E010B', '\U000E010C', '\U000E010D', '\U000E010E', '\U000E010F', '\U000E0110', '\U000E0111', '\U000E0112', '\U000E0113', '\U000E0114', '\U000E0115', '\U000E0116', '\U000E0117', '\U000E0118', '\U000E0119', '\U000E011A', '\U000E011B', '\U000E011C', '\U000E011D', '\U000E011E', '\U000E011F', '\U000E0120', '\U000E0121', '\U000E0122', '\U000E0123', '\U000E0124', '\U000E0125', '\U000E0126', '\U000E0127', '\U000E0128', '\U000E0129', '\U000E012A', '\U000E012B', '\uE012C', '\U000E012D', '\U000E012E', '\U000E012F', '\U000E0130', '\U000E0131', '\U000E0132', '\U000E0133', '\U000E0134', '\U000E0135', '\U000E0136', '\U000E0137', '\U000E0138', '\U000E0139', '\U000E013A', '\U000E013B', '\U000E013C', '\U000E013D', '\U000E013E', '\U000E013F', '\U000E0140', '\U000E0141', '\U000E0142', '\U000E0143', '\U000E0144', '\U000E0145', '\U000E0146', '\U000E0147', '\U000E0148', '\U000E0149', '\U000E014A', '\U000E014B', '\U000E014C', '\U000E014D', '\U000E014E', '\U000E014F', '\U000E0150', '\U000E0151', 
'\U000E0152', '\U000E0153', '\U000E0154', '\U000E0155', '\U000E0156', '\U000E0157', '\U000E0158', '\U000E0159', '\U000E015A', '\U000E015B', '\U000E015C', '\U000E015D', '\U000E015E', '\U000E015F', '\U000E0160', '\U000E0161', '\U000E0162', '\U000E0163', '\U000E0164', '\U000E0165', '\U000E0166', '\U000E0167', '\U000E0168', '\U000E0169', '\U000E016A', '\U000E016B', '\U000E016C', '\U000E016D', '\U000E016E', '\U000E016F', '\U000E0170', '\U000E0171', '\U000E0172', '\U000E0173', '\U000E0174', '\U000E0175', '\U000E0176', '\U000E0177', '\U000E0178', '\U000E0179', '\U000E017A', '\U000E017B', '\U000E017C', '\U000E017D', '\U000E017E', '\U000E017F', '\U000E0180', '\U000E0181', '\U000E0182', '\U000E0183', '\U000E0184', '\U000E0185', '\uE0186', '\U000E0187', '\U000E0188', '\U000E0189', '\U000E018A', '\U000E018B', '\U000E018C', '\U000E018D', '\U000E018E', '\U000E018F', '\U000E0190', '\U000E0191', '\U000E0192', '\U000E0193', '\U000E0194', '\U000E0195', '\U000E0196', '\U000E0197', '\U000E0198', '\U000E0199', '\U000E019A', '\U000E019B', '\U000E019C', '\U000E019D', '\U000E019E', '\U000E019F', '\U000E01A0', '\U000E01A1', '\U000E01A2', '\U000E01A3', '\U000E01A4', '\U000E01A5', '\U000E01A6', '\U000E01A7', '\U000E01A8', '\U000E01A9', '\U000E01AA', '\U000E01AB', '\U000E01AC', '\U000E01AD', '\U000E01AE', '\U000E01AF', '\U000E01B0', '\U000E01B1', '\U000E01B2', '\U000E01B3', '\U000E01B4', '\U000E01B5', '\U000E01B6', '\U000E01B7', '\U000E01B8', '\U000E01B9', '\U000E01BA', '\U000E01BB', '\U000E01BC', '\U000E01BD', '\U000E01BE', '\U000E01BF', '\U000E01C0', '\U000E01C1', '\U000E01C2', '\U000E01C3', '\U000E01C4', '\U000E01C5', '\U000E01C6', '\U000E01C7', '\U000E01C8', '\U000E01C9', '\U000E01CA', '\U000E01CB', '\U000E01CC', '\U000E01CD', '\U000E01CE', '\U000E01CF', '\U000E01D0', '\U000E01D1', '\U000E01D2', '\U000E01D3', '\U000E01D4', '\U000E01D5', '\U000E01D6', '\U000E01D7', '\U000E01D8', '\U000E01D9', '\U000E01DA', '\U000E01DB', '\U000E01DC', '\U000E01DD', '\U000E01DE', '\U000E01DF', '\U000E01E0', 
'\U000E01E1', '\U000E01E2', '\U000E01E3', '\U000E01E4', '\U000E01E5', '\U000E01E6', '\U000E01E7', '\U000E01E8', '\U000E01E9', '\U000E01EA', '\U000E01EB', '\U000E01EC', '\U000E01ED', '\U000E01EE', '\U000E01EF'];
NOTE.
Please note that we have both \U000XXXXX and \uXXXX representations here.
I want to count the characters in Unicode input text, such as the Hindi string "अब यहां से कहा जाएँ हम" or a single token like "समझा", excluding the nonspacing characters.
My implementation looks like
def countNonSpacingCharString(str):
    """Count the characters of ``str`` that do not appear in ``UNICODE_NSM``.

    Args:
        str: The text to scan, iterated one character at a time. The name
            shadows the built-in ``str`` inside this function; it is kept
            so existing callers are unaffected.

    Returns:
        The number of characters that are not members of ``UNICODE_NSM``.
    """
    # Each character contributes 1 unless it is found in the mark list.
    return sum(1 for ch in str if ch not in UNICODE_NSM)
Thanks to the help provided in the answers below, I have put everything together in this GitHub repository, which also includes a list of mark code points ready to be used in JavaScript / Node.js: https://github.com/loretoparisi/unicode_marks
Fastest way I came up with. len was slightly faster than sum. I built a set of all combining mark types in the setup.
test.py:
import sys
from unicodedata import category
# Every code point whose general category starts with 'M' (Mn, Mc, Me),
# built once so that each membership test below is a set lookup.
MARK_SET = {
    chr(cp)
    for cp in range(sys.maxunicode + 1)
    if category(chr(cp)).startswith('M')
}

s = "अब यहां से कहा जाएँ हम"


def count_len(s):
    """Return how many characters of ``s`` are not in ``MARK_SET``.

    Collects the non-mark characters into a list and takes its length.
    """
    non_marks = [ch for ch in s if ch not in MARK_SET]
    return len(non_marks)


def count_sum(s):
    """Return how many characters of ``s`` are not in ``MARK_SET``.

    Builds one boolean per character and sums them (True counts as 1).
    """
    flags = [ch not in MARK_SET for ch in s]
    return sum(flags)


if __name__ == '__main__':
    # Raw code point count first, then the two mark-excluding counts.
    for value in (len(s), count_len(s), count_sum(s)):
        print(value)
Output:
22
16
16
Timings:
C:\>py -m timeit -s "from test import count_sum,s" "count_sum(s)"
50000 loops, best of 5: 4.62 usec per loop
C:\>py -m timeit -s "from test import count_len,s" "count_len(s)"
50000 loops, best of 5: 3.97 usec per loop
It's worth noting that there is a third-party library called grapheme. grapheme.length(s) == 16, but it was much slower (118 µs). The full grapheme-detecting algorithm is more complicated than skipping the mark categories. Consider the combining emojis for families and skin colors.
See also Unicode Text Segmentation.
This might be a better alternative:
def countNonSpacingCharString(str):
    """Return the number of characters in ``str`` that are not in ``UNICODE_NSM``.

    The parameter name shadows the built-in ``str`` inside this function;
    it is kept so existing callers are unaffected.
    """
    kept = 0
    for ch in str:
        # Characters listed in UNICODE_NSM are skipped; all others are counted.
        if ch not in UNICODE_NSM:
            kept += 1
    return kept
How about using a dictionary to look up the values and if not present, increment the count? It should be faster than the former approach because the time complexity to check the presence of the character reduces to O(1).
The implementation should look somewhat like this:
Create a dict and populate it:
# Map every entry of UNICODE_NSM to 1 so that membership can be checked
# with a dictionary lookup instead of a list scan.
lookup_dict = dict.fromkeys(UNICODE_NSM, 1)
Look it up while looping through the string:
def countNonSpacingCharString(str):
    """Count the characters of ``str`` that are not keys of ``lookup_dict``.

    The time taken by each individual lookup is printed as a side effect,
    one line per character.

    Args:
        str: The text to scan, iterated one character at a time. The name
            shadows the built-in ``str`` inside this function; it is kept
            so existing callers are unaffected.

    Returns:
        The number of characters that are not present in ``lookup_dict``.
    """
    # Imported here so the function does not rely on a file-level
    # ``import time`` that the snippet never shows.
    import time

    count = 0
    for char in str:
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring short durations; time.time() is wall-clock time and can
        # be too coarse for a single dictionary lookup.
        start_time = time.perf_counter()
        # Direct key membership test; no dependence on the stored value.
        if char not in lookup_dict:
            count += 1
        print("--- %s seconds ---" % (time.perf_counter() - start_time))
    return count
Note that using str as a variable name in Python is a bad idea, because it shadows the built-in str type. Anyway, I would implement your function in the following way:
def countNonSpacingCharString(s):
    """Return how many characters of ``s`` are not in ``UNICODE_NSM`` (Python 2)."""
    def _is_counted(ch):
        # True for every character that is absent from the mark list.
        return ch not in UNICODE_NSM

    # Python 2 only: filter() returns a sequence there, so len() can be
    # applied to its result directly.
    return len(filter(_is_counted, s))
in Python 2
def countNonSpacingCharString(s):
    """Return how many characters of ``s`` are not in ``UNICODE_NSM`` (Python 3)."""
    def _is_counted(ch):
        # True for every character that is absent from the mark list.
        return ch not in UNICODE_NSM

    # filter() is lazy in Python 3, so materialise it before measuring.
    return len(list(filter(_is_counted, s)))
in Python 3
Inspecting my function with dis.dis showed that it produces less bytecode than your version with count, which suggests it might be faster, though this needs further investigation.
EDIT: I tested my code in Python 2 but not in Python 3, so I added a version for Python 3, based on Mohammad Banisaeid's answer in this topic.
EDIT 2: If you use UNICODE_NSM only for this purpose, you might try a set instead of a list, which should speed up the in operator, though again this needs further investigation. For a discussion of list vs. set performance, see this thread.
Perhaps the easiest way to do this is to use the unicodedata module. In part, because it will be more rigorously tested. Indeed, I found your list appeared to be including categories other than Mn. That is, it includes Unicode points from Mc (Mark, spacing combining) as well, but you said you only wanted to exclude Unicode points from Mn (Mark, Nonspacing).
eg.
import unicodedata
def countNonSpacingCharString(string):
    """Count the characters of ``string`` whose Unicode category is not 'Mn'.

    Args:
        string: The text to scan, iterated one character at a time.

    Returns:
        The number of characters for which ``unicodedata.category`` returns
        anything other than 'Mn'.
    """
    total = 0
    for ch in string:
        # Only category 'Mn' is excluded; 'Mc' and 'Me' marks are still counted.
        if unicodedata.category(ch) != 'Mn':
            total += 1
    return total
This appears to be about 60 times faster according to timeit.
You might get a TypeError, if your version of Python and therefore unicodedata is not up-to-date, and so not aware of recent additions to Unicode. You can get around this by installing unicodedata2 and using that instead.
From your comments it looks like you're really after counting "user-perceived characters". This is a complicated process with a number of edge cases. If you can, you should install the regex module in your environment (that would be MicroPython?). You can then do:
>>> parts = regex.findall(r'\X', 'अब यहां से कहा जाएँ हम')
>>> parts
['अ', 'ब', ' ', 'य', 'हां', ' ', 'से', ' ', 'क', 'हा', ' ', 'जा', 'एँ', ' ', 'ह', 'म']
>>> len(parts)
16
Which splits your string into "user perceived characters", and then you can work on this list of strings to get what you need.
Failing that, your current solution of just ignoring Mark code points is an 80/20 solution (it gets you most of the way there for the least amount of effort). You will have to revise your list of Unicode marks, though. My tests showed that your list was missing 113 code points across all the Indo-European and Dravidian scripts in Unicode (Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam, and Sinhala).
I extracted these characters by downloading and parsing: https://www.unicode.org/Public/11.0.0/ucd/UnicodeData.txt with the following code:
indian_script_range = range(0x0900, 0x0E00)  # doesn't include all indic scripts (eg. Thai)
basic_multilingual_plane = range(0x0000, 0x10000)
# use the latter if you want to be more thorough and include all indic scripts and non-indic scripts
codepoint_range = indian_script_range

codepoints = []
with open('UnicodeData.txt') as f:
    for line in f:
        # Fields are ';'-separated; only the first three are used here.
        hex_string, name, category, *rest = line.strip().split(';')
        codepoint_number = int(hex_string, base=16)
        is_mark = category in ('Mn', 'Mc', 'Me')
        in_scope = (
            codepoint_number in codepoint_range
            or name.startswith('VARIATION SELECTOR')  # you seemed to want to include these
        )
        if is_mark and in_scope:
            codepoints.append(chr(codepoint_number))

# Marks selected from the data file that the hand-written list lacks.
missing = set(codepoints) - set(UNICODE_NSM)
Mark Tolonen's answer is the fastest because it uses a set for the membership test. If you have a text of length n and m mark characters to compare against, then your worst-case runtime using two lists is O(nm). Using a set for the mark characters reduces that to O(n) on average.
Using unicodedata.category is just nicer because it is shorter and less prone to human error.
Performance comparison
You can clearly see that the markset_count and the category_count are way faster than the generator_count and the loop_count. Also the speed of the latter two varies way more. Interestingly, the generator_count is slower than the loop_count.
The markset_count is a bit faster than the category_count. I think that is the case because looking up the category and doing the string comparison also takes a bit of time. The difference is way more clear when you only plot the two and increase the text length:
import timeit
import sys
import unicodedata
import numpy as np
# Hand-maintained list of Unicode non-spacing marks, used by the list-based
# counting functions below.
#
# The trailing run U+E0100..U+E01EF (the supplementary variation selectors)
# is generated from a range instead of being spelled out. The spelled-out
# version contained '\uE012C' and '\uE0186': a '\u' escape takes exactly four
# hex digits, so those were two-character strings rather than the intended
# code points, and the corresponding characters were never matched.
UNICODE_NSM = ['\u0300', '\u0301', '\u0302', '\u0303', '\u0304', '\u0305', '\u0306', '\u0307', '\u0308', '\u0309', '\u030A', '\u030B', '\u030C', '\u030D', '\u030E', '\u030F', '\u0310', '\u0311', '\u0312', '\u0313', '\u0314', '\u0315', '\u0316', '\u0317', '\u0318', '\u0319', '\u031A', '\u031B', '\u031C', '\u031D', '\u031E', '\u031F', '\u0320', '\u0321', '\u0322', '\u0323', '\u0324', '\u0325', '\u0326', '\u0327', '\u0328', '\u0329', '\u032A', '\u032B', '\u032C', '\u032D', '\u032E', '\u032F', '\u0330', '\u0331', '\u0332', '\u0333', '\u0334', '\u0335', '\u0336', '\u0337', '\u0338', '\u0339', '\u033A', '\u033B', '\u033C', '\u033D', '\u033E', '\u033F', '\u0340', '\u0341', '\u0342', '\u0343', '\u0344', '\u0345', '\u0346', '\u0347', '\u0348', '\u0349', '\u034A', '\u034B', '\u034C', '\u034D', '\u034E', '\u034F', '\u0350', '\u0351', '\u0352', '\u0353', '\u0354', '\u0355', '\u0356', '\u0357', '\u0358', '\u0359', '\u035A', '\u035B', '\u035C', '\u035D', '\u035E', '\u035F', '\u0360', '\u0361', '\u0362', '\u0363', '\u0364', '\u0365', '\u0366', '\u0367', '\u0368', '\u0369', '\u036A', '\u036B', '\u036C', '\u036D', '\u036E', '\u036F', '\u0483', '\u0484', '\u0485', '\u0486', '\u0487', '\u0591', '\u0592', '\u0593', '\u0594', '\u0595', '\u0596', '\u0597', '\u0598', '\u0599', '\u059A', '\u059B', '\u059C', '\u059D', '\u059E', '\u059F', '\u05A0', '\u05A1', '\u05A2', '\u05A3', '\u05A4', '\u05A5', '\u05A6', '\u05A7', '\u05A8', '\u05A9', '\u05AA', '\u05AB', '\u05AC', '\u05AD', '\u05AE', '\u05AF', '\u05B0', '\u05B1', '\u05B2', '\u05B3', '\u05B4', '\u05B5', '\u05B6', '\u05B7', '\u05B8', '\u05B9', '\u05BA', '\u05BB', '\u05BC', '\u05BD', '\u05BF', '\u05C1', '\u05C2', '\u05C4', '\u05C5', '\u05C7', '\u0610', '\u0611', '\u0612', '\u0613', '\u0614', '\u0615', '\u0616', '\u0617', '\u0618', '\u0619', '\u061A', '\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650', '\u0651', '\u0652', '\u0653', '\u0654', '\u0655', '\u0656', '\u0657', '\u0658', '\u0659', '\u065A', '\u065B', '\u065C', '\u065D', 
    '\u065E', '\u065F', '\u0670', '\u06D6', '\u06D7', '\u06D8', '\u06D9', '\u06DA', '\u06DB', '\u06DC', '\u06DF', '\u06E0', '\u06E1', '\u06E2', '\u06E3', '\u06E4', '\u06E7', '\u06E8', '\u06EA', '\u06EB', '\u06EC', '\u06ED', '\u0711', '\u0730', '\u0731', '\u0732', '\u0733', '\u0734', '\u0735', '\u0736', '\u0737', '\u0738', '\u0739', '\u073A', '\u073B', '\u073C', '\u073D', '\u073E', '\u073F', '\u0740', '\u0741', '\u0742', '\u0743', '\u0744', '\u0745', '\u0746', '\u0747', '\u0748', '\u0749', '\u074A', '\u07A6', '\u07A7', '\u07A8', '\u07A9', '\u07AA', '\u07AB', '\u07AC', '\u07AD', '\u07AE', '\u07AF', '\u07B0', '\u07EB', '\u07EC', '\u07ED', '\u07EE', '\u07EF', '\u07F0', '\u07F1', '\u07F2', '\u07F3', '\u0816', '\u0817', '\u0818', '\u0819', '\u081B', '\u081C', '\u081D', '\u081E', '\u081F', '\u0820', '\u0821', '\u0822', '\u0823', '\u0825', '\u0826', '\u0827', '\u0829', '\u082A', '\u082B', '\u082C', '\u082D', '\u0859', '\u085A', '\u085B', '\u08E4', '\u08E5', '\u08E6', '\u08E7', '\u08E8', '\u08E9', '\u08EA', '\u08EB', '\u08EC', '\u08ED', '\u08EE', '\u08EF', '\u08F0', '\u08F1', '\u08F2', '\u08F3', '\u08F4', '\u08F5', '\u08F6', '\u08F7', '\u08F8', '\u08F9', '\u08FA', '\u08FB', '\u08FC', '\u08FD', '\u08FE', '\u0900', '\u0901', '\u0902', '\u093A', '\u093C', '\u093E', '\u0941', '\u0942', '\u0943', '\u0944', '\u0945', '\u0946', '\u0947', '\u0948', '\u094D', '\u0951', '\u0952', '\u0953', '\u0954', '\u0955', '\u0956', '\u0957', '\u0962', '\u0963', '\u0981', '\u09BC', '\u09C1', '\u09C2', '\u09C3', '\u09C4', '\u09CD', '\u09E2', '\u09E3', '\u0A01', '\u0A02', '\u0A3C', '\u0A41', '\u0A42', '\u0A47', '\u0A48', '\u0A4B', '\u0A4C', '\u0A4D', '\u0A51', '\u0A70', '\u0A71', '\u0A75', '\u0A81', '\u0A82', '\u0ABC', '\u0AC1', '\u0AC2', '\u0AC3', '\u0AC4', '\u0AC5', '\u0AC7', '\u0AC8', '\u0ACD', '\u0AE2', '\u0AE3', '\u0B01', '\u0B3C', '\u0B3F', '\u0B41', '\u0B42', '\u0B43', '\u0B44', '\u0B4D', '\u0B56', '\u0B62', '\u0B63', '\u0B82', '\u0BC0', '\u0BCD', '\u0C3E', '\u0C3F', '\u0C40', '\u0C46', '\u0C47', 
    '\u0C48', '\u0C4A', '\u0C4B', '\u0C4C', '\u0C4D', '\u0C55', '\u0C56', '\u0C62', '\u0C63', '\u0CBC', '\u0CBF', '\u0CC6', '\u0CCC', '\u0CCD', '\u0CE2', '\u0CE3', '\u0D41', '\u0D42', '\u0D43', '\u0D44', '\u0D4D', '\u0D62', '\u0D63', '\u0DCA', '\u0DD2', '\u0DD3', '\u0DD4', '\u0DD6', '\u0E31', '\u0E34', '\u0E35', '\u0E36', '\u0E37', '\u0E38', '\u0E39', '\u0E3A', '\u0E47', '\u0E48', '\u0E49', '\u0E4A', '\u0E4B', '\u0E4C', '\u0E4D', '\u0E4E', '\u0EB1', '\u0EB4', '\u0EB5', '\u0EB6', '\u0EB7', '\u0EB8', '\u0EB9', '\u0EBB', '\u0EBC', '\u0EC8', '\u0EC9', '\u0ECA', '\u0ECB', '\u0ECC', '\u0ECD', '\u0F18', '\u0F19', '\u0F35', '\u0F37', '\u0F39', '\u0F71', '\u0F72', '\u0F73', '\u0F74', '\u0F75', '\u0F76', '\u0F77', '\u0F78', '\u0F79', '\u0F7A', '\u0F7B', '\u0F7C', '\u0F7D', '\u0F7E', '\u0F80', '\u0F81', '\u0F82', '\u0F83', '\u0F84', '\u0F86', '\u0F87', '\u0F8D', '\u0F8E', '\u0F8F', '\u0F90', '\u0F91', '\u0F92', '\u0F93', '\u0F94', '\u0F95', '\u0F96', '\u0F97', '\u0F99', '\u0F9A', '\u0F9B', '\u0F9C', '\u0F9D', '\u0F9E', '\u0F9F', '\u0FA0', '\u0FA1', '\u0FA2', '\u0FA3', '\u0FA4', '\u0FA5', '\u0FA6', '\u0FA7', '\u0FA8', '\u0FA9', '\u0FAA', '\u0FAB', '\u0FAC', '\u0FAD', '\u0FAE', '\u0FAF', '\u0FB0', '\u0FB1', '\u0FB2', '\u0FB3', '\u0FB4', '\u0FB5', '\u0FB6', '\u0FB7', '\u0FB8', '\u0FB9', '\u0FBA', '\u0FBB', '\u0FBC', '\u0FC6', '\u102D', '\u102E', '\u102F', '\u1030', '\u1032', '\u1033', '\u1034', '\u1035', '\u1036', '\u1037', '\u1039', '\u103A', '\u103D', '\u103E', '\u1058', '\u1059', '\u105E', '\u105F', '\u1060', '\u1071', '\u1072', '\u1073', '\u1074', '\u1082', '\u1085', '\u1086', '\u108D', '\u109D', '\u135D', '\u135E', '\u135F', '\u1712', '\u1713', '\u1714', '\u1732', '\u1733', '\u1734', '\u1752', '\u1753', '\u1772', '\u1773', '\u17B4', '\u17B5', '\u17B7', '\u17B8', '\u17B9', '\u17BA', '\u17BB', '\u17BC', '\u17BD', '\u17C6', '\u17C9', '\u17CA', '\u17CB', '\u17CC', '\u17CD', '\u17CE', '\u17CF', '\u17D0', '\u17D1', '\u17D2', '\u17D3', '\u17DD', '\u180B', '\u180C', '\u180D', '\u18A9', 
    '\u1920', '\u1921', '\u1922', '\u1927', '\u1928', '\u1932', '\u1939', '\u193A', '\u193B', '\u1A17', '\u1A18', '\u1A56', '\u1A58', '\u1A59', '\u1A5A', '\u1A5B', '\u1A5C', '\u1A5D', '\u1A5E', '\u1A60', '\u1A62', '\u1A65', '\u1A66', '\u1A67', '\u1A68', '\u1A69', '\u1A6A', '\u1A6B', '\u1A6C', '\u1A73', '\u1A74', '\u1A75', '\u1A76', '\u1A77', '\u1A78', '\u1A79', '\u1A7A', '\u1A7B', '\u1A7C', '\u1A7F', '\u1B00', '\u1B01', '\u1B02', '\u1B03', '\u1B34', '\u1B36', '\u1B37', '\u1B38', '\u1B39', '\u1B3A', '\u1B3C', '\u1B42', '\u1B6B', '\u1B6C', '\u1B6D', '\u1B6E', '\u1B6F', '\u1B70', '\u1B71', '\u1B72', '\u1B73', '\u1B80', '\u1B81', '\u1BA2', '\u1BA3', '\u1BA4', '\u1BA5', '\u1BA8', '\u1BA9', '\u1BAB', '\u1BE6', '\u1BE8', '\u1BE9', '\u1BED', '\u1BEF', '\u1BF0', '\u1BF1', '\u1C2C', '\u1C2D', '\u1C2E', '\u1C2F', '\u1C30', '\u1C31', '\u1C32', '\u1C33', '\u1C36', '\u1C37', '\u1CD0', '\u1CD1', '\u1CD2', '\u1CD4', '\u1CD5', '\u1CD6', '\u1CD7', '\u1CD8', '\u1CD9', '\u1CDA', '\u1CDB', '\u1CDC', '\u1CDD', '\u1CDE', '\u1CDF', '\u1CE0', '\u1CE2', '\u1CE3', '\u1CE4', '\u1CE5', '\u1CE6', '\u1CE7', '\u1CE8', '\u1CED', '\u1CF4', '\u1DC0', '\u1DC1', '\u1DC2', '\u1DC3', '\u1DC4', '\u1DC5', '\u1DC6', '\u1DC7', '\u1DC8', '\u1DC9', '\u1DCA', '\u1DCB', '\u1DCC', '\u1DCD', '\u1DCE', '\u1DCF', '\u1DD0', '\u1DD1', '\u1DD2', '\u1DD3', '\u1DD4', '\u1DD5', '\u1DD6', '\u1DD7', '\u1DD8', '\u1DD9', '\u1DDA', '\u1DDB', '\u1DDC', '\u1DDD', '\u1DDE', '\u1DDF', '\u1DE0', '\u1DE1', '\u1DE2', '\u1DE3', '\u1DE4', '\u1DE5', '\u1DE6', '\u1DFC', '\u1DFD', '\u1DFE', '\u1DFF', '\u20D0', '\u20D1', '\u20D2', '\u20D3', '\u20D4', '\u20D5', '\u20D6', '\u20D7', '\u20D8', '\u20D9', '\u20DA', '\u20DB', '\u20DC', '\u20E1', '\u20E5', '\u20E6', '\u20E7', '\u20E8', '\u20E9', '\u20EA', '\u20EB', '\u20EC', '\u20ED', '\u20EE', '\u20EF', '\u20F0', '\u2CEF', '\u2CF0', '\u2CF1', '\u2D7F', '\u2DE0', '\u2DE1', '\u2DE2', '\u2DE3', '\u2DE4', '\u2DE5', '\u2DE6', '\u2DE7', '\u2DE8', '\u2DE9', '\u2DEA', '\u2DEB', '\u2DEC', '\u2DED', '\u2DEE', 
    '\u2DEF', '\u2DF0', '\u2DF1', '\u2DF2', '\u2DF3', '\u2DF4', '\u2DF5', '\u2DF6', '\u2DF7', '\u2DF8', '\u2DF9', '\u2DFA', '\u2DFB', '\u2DFC', '\u2DFD', '\u2DFE', '\u2DFF', '\u302A', '\u302B', '\u302C', '\u302D', '\u3099', '\u309A', '\uA66F', '\uA674', '\uA675', '\uA676', '\uA677', '\uA678', '\uA679', '\uA67A', '\uA67B', '\uA67C', '\uA67D', '\uA69F', '\uA6F0', '\uA6F1', '\uA802', '\uA806', '\uA80B', '\uA825', '\uA826', '\uA8C4', '\uA8E0', '\uA8E1', '\uA8E2', '\uA8E3', '\uA8E4', '\uA8E5', '\uA8E6', '\uA8E7', '\uA8E8', '\uA8E9', '\uA8EA', '\uA8EB', '\uA8EC', '\uA8ED', '\uA8EE', '\uA8EF', '\uA8F0', '\uA8F1', '\uA926', '\uA927', '\uA928', '\uA929', '\uA92A', '\uA92B', '\uA92C', '\uA92D', '\uA947', '\uA948', '\uA949', '\uA94A', '\uA94B', '\uA94C', '\uA94D', '\uA94E', '\uA94F', '\uA950', '\uA951', '\uA980', '\uA981', '\uA982', '\uA9B3', '\uA9B6', '\uA9B7', '\uA9B8', '\uA9B9', '\uA9BC', '\uAA29', '\uAA2A', '\uAA2B', '\uAA2C', '\uAA2D', '\uAA2E', '\uAA31', '\uAA32', '\uAA35', '\uAA36', '\uAA43', '\uAA4C', '\uAAB0', '\uAAB2', '\uAAB3', '\uAAB4', '\uAAB7', '\uAAB8', '\uAABE', '\uAABF', '\uAAC1', '\uAAEC', '\uAAED', '\uAAF6', '\uABE5', '\uABE8', '\uABED', '\uFB1E', '\uFE00', '\uFE01', '\uFE02', '\uFE03', '\uFE04', '\uFE05', '\uFE06', '\uFE07', '\uFE08', '\uFE09', '\uFE0A', '\uFE0B', '\uFE0C', '\uFE0D', '\uFE0E', '\uFE0F', '\uFE20', '\uFE21', '\uFE22', '\uFE23', '\uFE24', '\uFE25', '\uFE26', '\U000101FD', '\U00010A01', '\U00010A02', '\U00010A03', '\U00010A05', '\U00010A06', '\U00010A0C', '\U00010A0D', '\U00010A0E', '\U00010A0F', '\U00010A38', '\U00010A39', '\U00010A3A', '\U00010A3F', '\U00011001', '\U00011038', '\U00011039', '\U0001103A', '\U0001103B', '\U0001103C', '\U0001103D', '\U0001103E', '\U0001103F', '\U00011040', '\U00011041', '\U00011042', '\U00011043', '\U00011044', '\U00011045', '\U00011046', '\U00011080', '\U00011081', '\U000110B3', '\U000110B4', '\U000110B5', '\U000110B6', '\U000110B9', '\U000110BA', '\U00011100', '\U00011101', '\U00011102', '\U00011127', 
    '\U00011128', '\U00011129', '\U0001112A', '\U0001112B', '\U0001112D', '\U0001112E', '\U0001112F', '\U00011130', '\U00011131', '\U00011132', '\U00011133', '\U00011134', '\U00011180', '\U00011181', '\U000111B6', '\U000111B7', '\U000111B8', '\U000111B9', '\U000111BA', '\U000111BB', '\U000111BC', '\U000111BD', '\U000111BE', '\U000116AB', '\U000116AD', '\U000116B0', '\U000116B1', '\U000116B2', '\U000116B3', '\U000116B4', '\U000116B5', '\U000116B7', '\U00016F8F', '\U00016F90', '\U00016F91', '\U00016F92', '\U0001D167', '\U0001D168', '\U0001D169', '\U0001D17B', '\U0001D17C', '\U0001D17D', '\U0001D17E', '\U0001D17F', '\U0001D180', '\U0001D181', '\U0001D182', '\U0001D185', '\U0001D186', '\U0001D187', '\U0001D188', '\U0001D189', '\U0001D18A', '\U0001D18B', '\U0001D1AA', '\U0001D1AB', '\U0001D1AC', '\U0001D1AD', '\U0001D242', '\U0001D243', '\U0001D244',
] + [chr(cp) for cp in range(0xE0100, 0xE01F0)]  # U+E0100..U+E01EF inclusive
# Every character whose Unicode general category starts with 'M', according
# to the unicodedata tables of the running interpreter.
MARK_SET = {
    chr(c)
    for c in range(sys.maxunicode + 1)
    if unicodedata.category(chr(c))[0] == 'M'
}
print('len(UNICODE_NSM) = {}'.format(len(UNICODE_NSM)))
print('len(MARK_SET) = {}'.format(len(MARK_SET)))

# Benchmark input: the first 1000 characters of the data file.
filepath = "UnicodeData.txt"
with open(filepath) as f:
    text = f.read()
text = text[:1000]
def main():
    """Verify and time every counting function, then plot the timings.

    The result of ``loop_count`` on the module-level ``text`` is taken as the
    reference. For each function this prints whether it matches the reference
    and the min/mean/max of its measured durations; all measurements are then
    passed to ``create_boxplot``.
    """
    ground_truth = loop_count(text)
    functions = [(loop_count, 'loop_count'),
                 (generator_count, 'generator_count'),
                 (category_count, 'category_count'),
                 (markset_count, 'markset_count'),
                 ]
    functions = functions[::-1]
    duration_list = {}  # function name -> list of measured durations
    for func, name in functions:
        is_correct = func(text) == ground_truth
        # 500 measurements, each one covering 3 calls of func(text).
        durations = timeit.repeat(lambda: func(text), repeat=500, number=3)
        correctness = 'correct' if is_correct else 'NOT correct'
        duration_list[name] = durations
        print('{func:<20}: {correctness}, '
              'min: {min:0.3f}s, mean: {mean:0.3f}s, max: {max:0.3f}s'
              .format(func=name,
                      correctness=correctness,
                      min=min(durations),
                      mean=np.mean(durations),
                      max=max(durations),
                      ))
    create_boxplot(duration_list)
def create_boxplot(duration_list):
    """Write a horizontal box plot of the timings to ``output.png``.

    Args:
        duration_list: mapping of function name -> list of measured
            durations, as built by ``main``.
    """
    import statistics
    import seaborn as sns
    import matplotlib.pyplot as plt
    plt.figure(num=None, figsize=(8, 4), dpi=300,
               facecolor='w', edgecolor='k')
    sns.set(style="whitegrid")
    # Order the boxes by median duration. Comparing the raw lists would
    # compare them element by element, i.e. effectively order by the first
    # (noisy) sample only.
    ordered = sorted(duration_list.items(),
                     key=lambda item: statistics.median(item[1]))
    sorted_keys, sorted_vals = zip(*ordered)
    flierprops = dict(markerfacecolor='0.75', markersize=1,
                      linestyle='none')
    ax = sns.boxplot(data=sorted_vals, width=.3, orient='h',
                     flierprops=flierprops,)
    ax.set(xlabel="Time in s", ylabel="")
    # Label each box with the name of the function it belongs to.
    plt.yticks(plt.yticks()[0], sorted_keys)
    plt.tight_layout()
    plt.savefig("output.png")
def generator_count(text):
    """Return how many characters of ``text`` are not in ``UNICODE_NSM``.

    Deliberately written as a generator expression over the list: ``main``
    benchmarks this form against the explicit loop in ``loop_count``.
    """
    return sum(1 for char in text if char not in UNICODE_NSM)
def loop_count(text):
    """Return how many characters of ``text`` are not in ``UNICODE_NSM``.

    Deliberately written as an explicit loop over the list: ``main`` uses its
    result as the reference value and benchmarks it against the other forms.
    """
    # 1769137  (NOTE(review): presumably the count for the full data file — confirm)
    count = 0
    for char in text:
        if char not in UNICODE_NSM:
            count += 1
    return count
def markset_count(text):
    """Return how many characters of ``text`` are not in ``MARK_SET``.

    Each membership test yields a bool, and ``sum`` adds the ``True`` values.
    """
    return sum(char not in MARK_SET for char in text)
def category_count(text):
    """Return how many characters of ``text`` are not of category ``Mn``.

    Uses ``unicodedata.category`` for each character, so no hand-maintained
    list of marks is needed. Only ``Mn`` is excluded; characters of the other
    mark categories are counted.
    """
    return sum(unicodedata.category(char) != 'Mn' for char in text)
# Run the benchmark only when the file is executed as a script.
if __name__ == '__main__':
    main()
Related
I'm trying to perform keyphrase extraction with Python, using KeyBert and pke PositionRank. You can see an extract of my code below.
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import pke
# Sample sentence; in the real pipeline it comes from
# text_cleaning(df_tassonomia.iloc[1077].text, sentence_adjustment, stop_words)
text = "The life-cycle Global Warming Potential of the building resulting from the construction has been calculated for each stage in the life-cycle and is disclosed to investors and clients on demand"

# Pke: PositionRank, candidates of at most 5 words, window of 10, 10 best.
extractor = pke.unsupervised.PositionRank()
extractor.load_document(text, language='en')
extractor.candidate_selection(maximum_word_number=5)
extractor.candidate_weighting(window=10)
keyphrases = extractor.get_n_best(n=10)
print(keyphrases)

# KeyBert: same text, keyphrase vectorizer, 10 best.
kw_model = KeyBERT(model="all-mpnet-base-v2")
phrase_vectorizer = KeyphraseCountVectorizer()
keyphrases_2 = kw_model.extract_keywords(
    docs=text,
    vectorizer=phrase_vectorizer,
    keyphrase_ngram_range=(1, 5),
    top_n=10,
)
print("")
print(keyphrases_2)
and here the results:
[('cycle global warming potential', 0.44829175082921835), ('life', 0.17858359644549557), ('cycle', 0.15775994057934534), ('building', 0.09131084381406684), ('construction', 0.08860454878871142), ('investors', 0.05426710724030216), ('clients', 0.054111700289631526), ('stage', 0.045672396861507744), ('demand', 0.039158055731066406)]
[('cycle global warming potential', 0.5444), ('building', 0.4479), ('construction', 0.3476), ('investors', 0.1967), ('clients', 0.1519), ('demand', 0.1484), ('cycle', 0.1312), ('stage', 0.0931), ('life', 0.0847)]
I would like hyphenated compound words (such as life-cycle in the example) to be treated as a single word, but I cannot figure out how to exclude the - from the list of word separators.
Thank you in advance for any help.
Francesca
this could be a silly workaround but it may help :
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import pke
# The sentence is built from adjacent string literals inside parentheses; a
# plain quoted string cannot span several source lines.
text = ("The life-cycle Global Warming Potential of the building "
        "resulting from the construction has been calculated for each stage in "
        "the life-cycle and is disclosed to investors and clients on demand")
# Pke
# Remember the tokens that already contain "_" so that they are not turned
# into hyphenated words when the replacement is undone below.
tokens = text.split()
original = set([x for x in tokens if "_" in x])
# Replace the hyphens before extraction so that compounds such as
# "life-cycle" reach the extractor as "life_cycle".
text = text.replace("-", "_")
extractor = pke.unsupervised.PositionRank()
extractor.load_document(text, language='en')
extractor.candidate_selection(maximum_word_number=5)
extractor.candidate_weighting(window=10)
keyphrases = extractor.get_n_best(n=10)
# Undo the replacement in the extracted (keyphrase, score) pairs.
keyphrases_replaced = []
for pair in keyphrases:
    if "_" in pair[0] and pair[0] not in original:
        keyphrases_replaced.append((pair[0].replace("_", "-"), pair[1]))
    else:
        keyphrases_replaced.append(pair)
print(keyphrases_replaced)
# KeyBert
# The model has to be created in this snippet as well; it was used without
# ever being defined.
kw_model = KeyBERT(model="all-mpnet-base-v2")
# NOTE(review): `text` still contains underscores at this point, so these
# results are presumably reported with "_" rather than "-" — confirm.
keyphrases_2 = kw_model.extract_keywords(docs=text,
                                         vectorizer=KeyphraseCountVectorizer(),
                                         keyphrase_ngram_range=(1, 5),
                                         top_n=10
                                         )
print("")
print(keyphrases_2)
The output should look like this:
[('life-cycle global warming potential', 0.5511001220016548), ('life-cycle', 0.20123353586644233), ('construction', 0.11945270995269436), ('building', 0.10637157845606555), ('investors', 0.06675114967366767), ('stage', 0.05503532672910801), ('clients', 0.0507262942318816), ('demand', 0.05056281895492815)]
I hope this helps :)
The issue has been fixed in the latest pke update: https://github.com/boudinfl/pke/issues/195
import pke
# Select candidates from a sentence containing a hyphenated compound with a
# noun-phrase grammar, then show the candidate keys that were found.
sample_sentence = 'BERT is a state-of-the-art model.'
noun_phrase_grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"
extractor = pke.unsupervised.TopicRank()
extractor.load_document(input=sample_sentence, language='en')
extractor.grammar_selection(grammar=noun_phrase_grammar)
print(extractor.candidates.keys())
now returns this output:
dict_keys(['bert', 'state-of-the-art model'])
I'm brand new to Python and I'm struggling with how to add up certain sections of a CSV file in Python. I'm not allowed to use "import csv".
I'm importing the TipJoke CSV file from https://vincentarelbundock.github.io/Rdatasets/datasets.html
This is the only code I have so far that works, and I'm at a total loss on where to go from here.
if __name__ == '__main__':
    from pprint import pprint
    from string import punctuation

    # Read the file line by line, dropping the double quotes from each line.
    # The `with` block closes the file once reading is finished.
    tipList = []
    with open("TipJoke.csv", "r") as f:
        for line in f:
            #deletes the quotes
            line = line.replace('"', '')
            tipList.append(line)
    # `tipList[]` is not valid syntax; print the whole list.
    pprint(tipList)
Output:
[',Card,Tip,Ad,Joke,None\n',
'1,None,1,0,0,1\n',
'2,Joke,1,0,1,0\n',
'3,Ad,0,1,0,0\n',
'4,None,0,0,0,1\n',
'5,None,1,0,0,1\n',
'6,None,0,0,0,1\n',
'7,Ad,0,1,0,0\n',
'8,Ad,0,1,0,0\n',
'9,None,0,0,0,1\n',
'10,None,0,0,0,1\n',
'11,None,1,0,0,1\n',
'12,Ad,0,1,0,0\n',
'13,None,0,0,0,1\n',
'14,Ad,1,1,0,0\n',
'15,Joke,1,0,1,0\n',
'16,Joke,0,0,1,0\n',
'17,Joke,1,0,1,0\n',
'18,None,0,0,0,1\n',
'19,Joke,0,0,1,0\n',
'20,None,0,0,0,1\n',
'21,Ad,1,1,0,0\n',
'22,Ad,1,1,0,0\n',
'23,Ad,0,1,0,0\n',
'24,Joke,0,0,1,0\n',
'25,Joke,1,0,1,0\n',
'26,Joke,0,0,1,0\n',
'27,None,1,0,0,1\n',
'28,Joke,1,0,1,0\n',
'29,Joke,1,0,1,0\n',
'30,None,1,0,0,1\n',
'31,Joke,0,0,1,0\n',
'32,None,1,0,0,1\n',
'33,Joke,1,0,1,0\n',
'34,Ad,0,1,0,0\n',
'35,Joke,0,0,1,0\n',
'36,Ad,1,1,0,0\n',
'37,Joke,0,0,1,0\n',
'38,Ad,0,1,0,0\n',
'39,Joke,0,0,1,0\n',
'40,Joke,0,0,1,0\n',
'41,Joke,1,0,1,0\n',
'42,None,0,0,0,1\n',
'43,None,0,0,0,1\n',
'44,Ad,0,1,0,0\n',
'45,None,0,0,0,1\n',
'46,None,0,0,0,1\n',
'47,Ad,0,1,0,0\n',
'48,Joke,0,0,1,0\n',
'49,Joke,1,0,1,0\n',
'50,None,1,0,0,1\n',
'51,None,0,0,0,1\n',
'52,Joke,1,0,1,0\n',
'53,Joke,1,0,1,0\n',
'54,Joke,0,0,1,0\n',
'55,None,1,0,0,1\n',
'56,Ad,0,1,0,0\n',
'57,Joke,0,0,1,0\n',
'58,None,0,0,0,1\n',
'59,Ad,0,1,0,0\n',
'60,Joke,1,0,1,0\n',
'61,Ad,0,1,0,0\n',
'62,None,1,0,0,1\n',
'63,Joke,0,0,1,0\n',
'64,Ad,0,1,0,0\n',
'65,Joke,0,0,1,0\n',
'66,Ad,0,1,0,0\n',
'67,Ad,0,1,0,0\n',
'68,Ad,0,1,0,0\n',
'69,None,0,0,0,1\n',
'70,Joke,1,0,1,0\n',
'71,None,1,0,0,1\n',
'72,None,0,0,0,1\n',
'73,None,0,0,0,1\n',
'74,Joke,0,0,1,0\n',
'75,Ad,1,1,0,0\n',
'76,Ad,0,1,0,0\n',
'77,Ad,1,1,0,0\n',
'78,Joke,0,0,1,0\n',
'79,Joke,0,0,1,0\n',
'80,Ad,1,1,0,0\n',
'81,Ad,0,1,0,0\n',
'82,None,0,0,0,1\n',
'83,Ad,0,1,0,0\n',
'84,Joke,0,0,1,0\n',
'85,Joke,0,0,1,0\n',
'86,Ad,1,1,0,0\n',
'87,None,1,0,0,1\n',
'88,Joke,1,0,1,0\n',
'89,Ad,0,1,0,0\n',
'90,None,0,0,0,1\n',
'91,None,0,0,0,1\n',
'92,Joke,0,0,1,0\n',
'93,Joke,0,0,1,0\n',
'94,Ad,0,1,0,0\n',
'95,Ad,0,1,0,0\n',
'96,Ad,0,1,0,0\n',
'97,Joke,1,0,1,0\n',
'98,None,0,0,0,1\n',
'99,None,0,0,0,1\n',
'100,None,1,0,0,1\n',
'101,Joke,0,0,1,0\n',
'102,Joke,0,0,1,0\n',
'103,Ad,1,1,0,0\n',
'104,Ad,0,1,0,0\n',
'105,Ad,0,1,0,0\n',
'106,Ad,1,1,0,0\n',
'107,Ad,0,1,0,0\n',
'108,None,0,0,0,1\n',
'109,Ad,0,1,0,0\n',
'110,Joke,1,0,1,0\n',
'111,None,0,0,0,1\n',
'112,Ad,0,1,0,0\n',
'113,Ad,0,1,0,0\n',
'114,None,0,0,0,1\n',
'115,Ad,0,1,0,0\n',
'116,None,0,0,0,1\n',
'117,None,0,0,0,1\n',
'118,Ad,0,1,0,0\n',
'119,None,1,0,0,1\n',
'120,Ad,1,1,0,0\n',
'121,Ad,0,1,0,0\n',
'122,Ad,1,1,0,0\n',
'123,None,0,0,0,1\n',
'124,None,0,0,0,1\n',
'125,Joke,1,0,1,0\n',
'126,Joke,1,0,1,0\n',
'127,Ad,0,1,0,0\n',
'128,Joke,0,0,1,0\n',
'129,Joke,0,0,1,0\n',
'130,Ad,0,1,0,0\n',
'131,None,0,0,0,1\n',
'132,None,0,0,0,1\n',
'133,None,0,0,0,1\n',
'134,Joke,1,0,1,0\n',
'135,Ad,0,1,0,0\n',
'136,None,0,0,0,1\n',
'137,Joke,0,0,1,0\n',
'138,Ad,0,1,0,0\n',
'139,Ad,0,1,0,0\n',
'140,None,0,0,0,1\n',
'141,Joke,0,0,1,0\n',
'142,None,0,0,0,1\n',
'143,Ad,0,1,0,0\n',
'144,None,1,0,0,1\n',
'145,Joke,0,0,1,0\n',
'146,Ad,0,1,0,0\n',
'147,Ad,0,1,0,0\n',
'148,Ad,0,1,0,0\n',
'149,Joke,1,0,1,0\n',
'150,Ad,1,1,0,0\n',
'151,Joke,1,0,1,0\n',
'152,None,0,0,0,1\n',
'153,Ad,0,1,0,0\n',
'154,None,0,0,0,1\n',
'155,None,0,0,0,1\n',
'156,Ad,0,1,0,0\n',
'157,Ad,0,1,0,0\n',
'158,Joke,0,0,1,0\n',
'159,None,0,0,0,1\n',
'160,Joke,1,0,1,0\n',
'161,None,1,0,0,1\n',
'162,Ad,1,1,0,0\n',
'163,Joke,0,0,1,0\n',
'164,Joke,0,0,1,0\n',
'165,Ad,0,1,0,0\n',
'166,Joke,1,0,1,0\n',
'167,Joke,1,0,1,0\n',
'168,Ad,0,1,0,0\n',
'169,Joke,1,0,1,0\n',
'170,Joke,0,0,1,0\n',
'171,Ad,0,1,0,0\n',
'172,Joke,0,0,1,0\n',
'173,Joke,0,0,1,0\n',
'174,Ad,0,1,0,0\n',
'175,None,0,0,0,1\n',
'176,Joke,1,0,1,0\n',
'177,Ad,0,1,0,0\n',
'178,Joke,0,0,1,0\n',
'179,Joke,0,0,1,0\n',
'180,None,0,0,0,1\n',
'181,None,0,0,0,1\n',
'182,Ad,0,1,0,0\n',
'183,None,0,0,0,1\n',
'184,None,0,0,0,1\n',
'185,None,0,0,0,1\n',
'186,None,0,0,0,1\n',
'187,Ad,0,1,0,0\n',
'188,None,1,0,0,1\n',
'189,Ad,0,1,0,0\n',
'190,Ad,0,1,0,0\n',
'191,Ad,0,1,0,0\n',
'192,Joke,1,0,1,0\n',
'193,Joke,0,0,1,0\n',
'194,Ad,0,1,0,0\n',
'195,None,0,0,0,1\n',
'196,Joke,1,0,1,0\n',
'197,Joke,0,0,1,0\n',
'198,Joke,1,0,1,0\n',
'199,Ad,0,1,0,0\n',
'200,None,0,0,0,1\n',
'201,Joke,1,0,1,0\n',
'202,Joke,0,0,1,0\n',
'203,Joke,0,0,1,0\n',
'204,Ad,0,1,0,0\n',
'205,None,0,0,0,1\n',
'206,Ad,0,1,0,0\n',
'207,Ad,0,1,0,0\n',
'208,Joke,0,0,1,0\n',
'209,Ad,0,1,0,0\n',
'210,Joke,0,0,1,0\n',
'211,None,0,0,0,1\n']
I'm currently trying to find the Total number of entries of the specified card type and the Percentage of tips given for the specified card type with two decimal places of precision. The tip column is the 0 or 1 right after the card type (None, Ad, Joke).
If you are allowed to use the pandas library, then:
import pandas as pd
df = pd.read_csv("TipJoke.csv")
df is a pandas DataFrame object on which you can perform whatever filtering you need.
For example, if you want to get the data for Joke, you can filter like this:
print(df[df["Card"] == "Joke"])
Note, though, that I'm only pointing you in the right direction, not providing the whole logic for your question.
This works
from pprint import pprint
from string import punctuation
# Sum the tip flag (third comma-separated field) per card type (second field)
# over every line of TipJoke.csv, then print the totals.
counts = {"Joke": 0, "Ad": 0, "None": 0}
with open("TipJoke.csv", "r") as f:
    for line in f:
        # Drop quotes and the trailing newline before splitting into fields.
        line_clean = line.replace('"', "").replace("\n", "").split(",")
        try:
            counts[line_clean[1]] += int(line_clean[2])
        except (IndexError, KeyError, ValueError):
            # Skip only lines that cannot be parsed (too few fields, an unknown
            # card type, or a non-numeric tip -- e.g. a header row) instead of
            # silently swallowing every possible error with a bare `except`.
            continue
print(counts)
I have a list of ips like this;
ban = ['162.210.197.58', '36.72.31.198', '163.172.221.63', '134.35.42.49', '176.65.117.81', '46.8.28.110', '110.139.16.72', '31.29.61.218', '38.95.109.67', '31.155.204.143', '5.254.65.190', '46.101.127.145', '111.15.193.84', '187.142.198.151', '188.227.45.110', '125.236.168.242', '5.254.65.122', '5.254.65.28', '185.182.81.93', '188.210.133.199', '151.241.125.237', '207.148.111.126', '145.239.70.231', '2.94.113.65', '204.85.191.30', '2.229.128.135', '92.127.255.182', '46.133.77.143', '118.69.63.15', '46.39.55.162', '185.182.81.14', '188.244.132.49', '138.197.160.121', '176.15.222.56', '82.178.233.175', '94.41.43.252', '78.137.73.171', '5.142.90.22', '212.2.212.151', '207.244.77.174', '176.59.46.108', '179.7.106.194', '154.118.17.57', '5.254.65.119', '176.33.236.247', '41.218.215.25', '199.249.224.47', '207.244.83.97', '84.241.13.93', '197.210.54.218', '93.35.166.148', '218.107.49.9', '39.55.130.71', '78.168.207.252', '213.141.227.238', '36.69.70.20', '5.254.65.218', '103.10.197.12', '207.244.83.194', '122.163.128.154', '93.185.16.82', '95.24.168.216', '103.16.26.148', '118.100.59.199', '209.59.188.147', '207.244.93.234', '88.81.46.212', '23.105.140.146', '89.104.104.225', '83.149.21.13', '41.34.51.195', '146.185.133.214', '37.204.108.198', '188.123.231.194', '85.140.3.172', '185.182.81.45', '185.182.81.61', '5.254.65.213', '185.182.81.136', '103.10.197.205', '185.182.81.86', '207.244.79.138', '116.240.84.42', '91.219.101.68', '80.191.221.23', '178.62.220.63', '209.58.148.85', '167.114.159.186', '108.59.8.246', '74.82.17.84', '178.129.172.216', '36.72.178.137', '41.254.7.157', '185.190.241.83', '185.182.81.58', '107.77.209.79', '207.244.77.137', '58.27.54.130', '69.15.118.106', '85.94.0.117', '41.215.173.28', '185.182.81.28', '46.101.18.209', '5.254.65.185', '195.123.217.178', '2.48.190.213', '188.247.190.79', '192.119.160.190', '95.72.13.156', '68.149.146.193', '197.157.244.206', '103.10.197.157', '176.195.155.114', '194.88.107.55', '83.220.237.120', 
'185.182.81.53', '82.102.21.114', '84.241.6.212', '207.244.78.48', '125.238.220.170', '159.8.170.20', '176.15.60.114', '46.138.146.207', '128.199.201.63', '5.254.65.179', '45.242.236.151', '188.162.177.199', '186.68.202.84', '5.202.179.40', '41.215.173.10', '103.255.6.80', '83.102.219.15', '188.32.127.17', '185.182.81.89', '207.244.86.235', '212.252.141.6', '37.147.79.146', '176.59.80.125', '212.71.255.8', '87.240.26.58', '91.79.215.243', '185.182.81.6', '207.244.83.102', '185.182.81.20', '109.133.114.88', '199.249.223.78', '37.110.94.227', '185.182.81.42', '109.252.76.194', '167.99.46.145', '95.221.186.30', '185.48.188.131', '2.191.41.213', '82.113.106.115', '178.62.102.23', '197.255.118.226', '185.182.81.12', '41.207.5.116', '68.133.80.56', '103.255.7.6', '162.210.197.53', '138.197.174.120', '185.182.81.40', '154.124.87.182', '73.218.73.13', '185.182.81.66', '139.5.231.112', '185.69.124.60', '93.190.177.36', '207.244.79.136', '66.160.199.70', '78.106.211.173', '121.52.156.4', '109.201.133.100', '78.85.40.39', '91.79.162.72', '83.242.238.138', '5.254.65.125', '5.254.65.175', '103.214.2.4', '37.232.192.102', '61.5.58.240', '41.190.14.40', '146.185.157.250', '195.123.225.47', '103.60.210.61', '85.102.111.101', '77.30.202.151', '194.146.149.46', '95.27.45.39', '146.185.158.24', '207.244.86.240', '195.175.45.154', '195.149.108.48', '95.12.119.124', '82.102.31.203', '156.221.118.172', '78.108.178.87', '185.182.81.37', '41.219.31.245', '191.43.27.200', '142.44.210.125', '5.254.65.169', '46.196.145.191', '68.133.80.55', '5.254.65.127', '68.133.80.58', '31.173.84.201', '39.50.173.251', '46.242.10.175', '201.208.20.239', '174.16.34.27', '65.19.167.132', '182.0.139.135', '128.68.185.234', '78.178.85.114', '182.182.91.44', '103.10.197.155', '199.249.223.77', '46.173.75.34', '197.32.29.188', '185.18.46.170', '107.181.182.227', '105.50.114.109', '199.87.154.255', '185.182.81.23', '5.115.181.178', '67.205.137.158', '46.73.40.13', '117.224.14.73', '155.239.150.15', 
'98.253.130.3', '122.129.77.114', '176.194.163.88', '109.63.235.0', '85.140.7.208', '207.244.82.229', '94.29.124.69', '188.245.245.146', '78.177.102.153', '197.210.173.174', '85.174.236.178', '82.247.106.95', '207.244.83.218', '162.210.197.55', '185.180.197.65', '31.8.130.8', '114.124.135.2', '46.172.203.108', '185.182.81.41', '199.115.116.37', '207.244.78.175', '207.244.77.139', '185.182.81.85', '172.56.30.213', '79.126.114.90', '5.254.65.114', '207.244.77.9', '103.10.197.202', '87.245.249.103', '185.182.81.48', '39.54.218.203', '109.169.172.138', '39.50.16.30', '89.104.104.227', '78.189.31.17', '207.244.77.19', '95.28.26.67', '39.54.6.166', '145.255.169.92', '176.43.253.71', '185.107.70.202', '65.19.167.131', '199.244.86.149', '197.156.241.249', '114.125.184.172', '89.178.171.173', '151.235.84.235', '185.104.184.120', '197.243.40.75', '88.253.119.59', '103.76.21.243', '207.244.78.50', '95.73.121.206', '185.182.81.60', '176.55.167.228', '179.54.147.13', '156.198.95.108', '199.249.223.63', '165.139.150.133', '185.182.81.16', '185.182.81.80', '162.210.197.54', '41.34.8.12', '86.161.166.126', '103.21.125.80', '5.254.65.15', '65.49.126.73', '69.117.214.158', '92.96.162.204', '64.62.232.66', '2.48.180.82', '213.87.163.155', '108.59.8.217', '91.151.188.194', '197.229.0.32', '76.218.238.42', '178.62.6.233', '159.65.117.243', '217.118.78.80', '109.252.25.1', '96.80.89.69', '89.40.118.87', '185.182.81.25', '73.43.73.161', '108.59.10.184', '5.155.214.63', '103.10.197.195', '95.71.21.143', '113.21.99.50', '198.96.155.3', '203.156.158.201', '5.254.65.19', '103.10.197.133', '207.244.94.5', '190.39.1.149', '103.10.197.61', '213.87.139.175', '91.244.39.229', '136.0.99.204', '77.34.114.167', '185.76.248.253', '185.182.81.139', '207.244.82.154', '199.115.115.194', '139.59.84.23', '209.126.90.78', '156.219.221.191', '109.252.99.3', '67.252.57.173', '222.70.120.196', '207.244.78.47', '156.197.120.27', '185.182.81.76', '78.95.166.34', '103.10.197.131', '37.21.231.95', 
'87.225.68.121', '46.40.227.14', '77.43.173.56', '114.124.181.244', '37.1.129.188', '128.72.83.20', '5.254.65.85', '31.129.204.120', '5.158.237.18', '151.243.199.70', '185.182.81.77', '103.228.157.94', '69.202.142.68', '94.25.180.120', '185.182.81.91', '103.10.197.154', '202.22.227.26', '49.145.151.70', '207.244.86.198', '185.182.81.19', '185.182.81.69', '5.254.65.123', '65.49.68.203', '103.79.155.58', '216.185.36.99', '80.191.221.21', '5.254.65.220', '91.215.69.212', '5.254.65.24', '103.10.197.204', '105.67.7.89', '195.112.116.163', '46.42.167.129', '190.142.101.84', '207.244.79.152', '5.59.53.95', '14.139.125.241', '197.211.63.74', '195.190.107.166', '185.182.81.135', '194.146.181.131', '83.220.237.252', '41.215.173.33', '142.44.210.206', '62.213.109.218', '199.168.151.35', '5.254.65.217', '45.242.236.74', '80.253.23.149', '185.182.81.11', '93.168.185.76', '185.182.81.82', '77.106.62.60', '185.220.101.20', '185.182.81.47', '185.182.81.56', '5.41.65.151', '66.160.188.43', '74.82.35.71', '79.139.176.23', '88.203.110.34', '64.147.94.201', '213.87.132.49', '91.216.114.201', '179.53.157.148', '185.182.81.4', '196.74.129.52', '95.72.247.249', '185.182.81.71', '185.182.81.138', '154.68.60.100', '81.214.142.169', '147.110.59.244', '200.2.187.239', '68.133.80.57', '5.44.174.110', '142.234.201.129', '89.238.177.226', '120.188.86.112', '207.244.78.173', '173.208.82.162', '115.178.200.218', '139.162.233.190', '43.245.8.67', '103.10.197.58', '207.244.83.219', '148.0.219.242', '5.126.66.228', '37.190.37.137', '182.179.179.160', '103.10.197.220', '41.66.255.100', '37.144.44.196', '207.244.77.18', '5.254.65.102', '213.33.193.34', '185.182.81.51', '154.121.5.242', '92.246.191.120', '186.167.242.158', '217.117.66.54', '85.88.176.70', '114.124.182.15', '146.185.177.103', '179.180.67.5', '207.154.232.214', '91.192.70.25', '156.38.35.15', '147.135.254.228', '103.10.197.130', '195.214.140.20', '31.29.217.147', '73.170.68.18', '142.234.201.141', '2.191.76.162', '151.80.36.117', 
'5.254.65.195', '213.87.225.138', '5.0.249.218', '27.68.68.184', '178.59.189.246', '95.30.31.106', '213.87.126.137', '104.45.14.157', '196.6.216.5', '185.182.81.131', '5.254.65.177', '103.255.7.21', '212.75.159.112', '103.10.197.196', '217.55.47.233', '109.252.76.112', '36.84.62.52', '185.253.97.52', '31.148.253.33', '185.182.81.78', '159.224.41.242', '46.101.16.232', '5.254.65.97', '89.160.136.97', '207.244.78.8', '5.254.65.82', '185.182.81.92', '95.29.19.127', '185.182.81.22', '66.160.201.58', '93.90.204.172', '2.93.201.96', '182.1.178.122', '148.0.142.65', '94.233.224.122', '163.232.200.115', '200.236.75.164', '106.204.119.26', '168.8.192.22', '185.182.81.83', '107.77.209.106', '207.244.78.17', '89.104.104.224', '106.51.234.22', '71.12.167.41', '188.247.109.2', '213.87.156.209', '192.240.119.59', '78.180.74.219', '37.107.111.54', '103.10.197.132', '5.254.65.8', '105.167.52.122', '146.185.157.173', '95.184.110.95', '23.101.10.145', '5.254.65.124', '151.231.40.224', '178.129.222.227', '185.182.81.68', '185.107.47.215', '146.185.156.139', '120.188.39.56', '176.14.163.226', '178.62.113.166', '95.153.134.122', '185.182.81.75', '156.194.101.98', '185.182.81.72', '182.187.97.214', '41.72.194.58', '196.201.199.210', '185.182.81.79', '51.39.237.115', '202.146.235.80', '176.123.126.254', '86.149.9.202', '41.254.4.43', '91.240.73.248', '185.88.157.231', '171.229.95.88', '5.254.65.129', '139.59.225.74', '95.24.175.100', '5.254.65.182', '185.182.81.50', '89.31.57.5', '129.56.11.18', '207.244.83.104', '199.115.118.85', '89.235.123.202', '154.69.67.5', '36.84.62.49', '146.185.157.38', '31.29.248.93', '109.252.20.82', '207.244.94.36', '66.160.188.67', '5.254.65.6', '38.95.109.110', '178.175.132.67', '103.214.2.35', '196.24.72.24', '185.182.81.9', '109.252.177.102', '136.252.163.99', '185.104.184.116', '162.247.72.199', '213.55.105.3', '5.79.68.161', '188.170.103.189', '5.254.65.21', '88.228.167.196', '41.242.138.49', '103.10.197.59', '178.73.220.45', '200.82.249.248', 
'5.254.65.107', '207.244.77.172', '5.254.65.84', '37.190.37.197', '213.87.131.60', '37.112.229.42', '95.24.91.228', '185.182.81.84', '77.247.181.162', '78.81.231.143', '185.182.81.46', '207.244.83.114', '41.253.39.240', '92.50.223.44', '173.208.82.164', '41.85.161.178', '213.87.122.199', '37.52.183.88', '207.244.72.201', '109.252.53.106', '108.59.8.208', '104.129.18.10', '122.57.37.39', '176.59.48.37', '125.178.76.189', '154.130.48.206', '113.169.71.251', '91.215.69.23', '54.36.105.8', '95.10.207.189', '128.31.0.13', '81.211.123.74', '103.10.197.13', '46.242.10.245', '81.131.164.246', '156.194.244.214', '178.62.106.169', '204.85.191.31', '207.244.86.200', '77.27.69.166', '192.96.205.131', '139.59.3.30', '80.252.16.20', '185.182.81.137', '150.100.253.6', '103.25.72.234', '207.244.86.195', '154.160.16.103', '78.95.6.2', '185.182.81.44', '197.248.213.18', '202.142.79.94', '38.95.108.247', '2.135.158.83', '37.73.82.56', '103.255.6.75', '5.254.65.110', '64.62.175.68', '207.244.83.209', '172.241.114.173', '207.244.79.137', '207.244.82.251', '207.244.86.232', '94.233.224.26', '174.141.115.146', '109.174.112.128', '93.91.80.6', '41.190.3.77', '159.89.115.182', '188.163.3.226', '95.191.226.57', '139.5.154.162', '193.0.219.72', '185.182.81.70', '138.197.174.51', '41.217.109.25', '39.50.91.154', '37.204.198.46', '130.180.218.30', '5.254.65.17', '138.197.160.42', '5.254.65.187', '37.195.213.64', '188.170.73.65', '197.48.231.108', '185.189.113.42', '185.182.81.74', '41.66.203.243', '163.21.7.42', '5.254.65.83', '157.50.236.233', '94.233.224.7', '128.71.22.17', '130.34.246.190', '185.52.141.42', '139.59.84.2', '84.241.202.178', '123.242.160.17', '74.82.17.52', '182.0.205.139', '203.202.229.194', '89.238.191.150', '79.165.225.194', '73.61.8.75', '41.254.8.167', '185.182.81.5', '94.47.16.26', '182.186.238.234', '156.216.204.96', '109.252.57.81', '180.252.104.52', '79.148.109.242', '66.160.201.47', '2.191.110.236', '207.244.77.13', '5.254.65.98', '145.255.166.214', 
'159.203.130.12', '197.234.221.77', '159.89.206.211', '85.141.69.27', '185.182.81.90', '77.31.101.152', '202.22.237.40', '95.221.205.237', '207.244.77.55', '104.236.233.75', '5.254.65.104', '104.236.233.182', '103.10.197.60', '103.10.197.222', '182.54.141.154', '138.197.174.79', '176.33.136.50', '185.182.81.10', '185.220.101.30', '94.138.40.2', '41.44.178.2', '196.207.121.147', '213.138.86.113', '31.173.83.65', '199.115.115.197', '46.72.89.106', '103.240.170.209', '5.254.65.215', '146.185.171.157', '185.182.81.57', '188.119.59.75', '46.164.104.176', '176.52.96.95', '207.244.83.99', '31.43.31.116', '178.62.208.24', '5.254.65.167', '185.182.81.73', '104.194.24.250', '79.200.158.58', '79.111.52.85', '103.10.197.91', '199.249.224.65', '5.136.175.48', '111.119.165.253', '216.218.222.12', '191.248.209.198', '5.254.65.81', '178.46.97.50', '27.97.248.46', '190.72.126.220', '178.140.204.131', '5.114.82.81', '118.175.154.120', '207.244.90.69', '185.182.81.7', '103.17.88.247', '108.248.141.54', '130.34.246.189', '207.244.79.154', '41.66.255.247', '128.69.14.62', '87.107.208.172', '199.115.116.35', '46.191.213.221', '199.115.114.220', '172.56.44.234', '69.12.94.91', '5.254.65.99', '37.203.243.97', '185.182.81.29', '37.210.142.211', '207.244.78.9', '206.189.56.39', '193.104.27.86', '37.190.37.236', '112.197.34.56', '189.124.82.209', '99.66.132.171', '5.254.65.16', '46.201.114.83', '81.31.178.23', '94.47.97.98', '82.102.31.251', '103.10.197.203', '94.98.252.208', '14.139.160.236', '185.182.81.130', '184.43.6.218', '92.37.143.180', '5.254.65.12', '83.142.185.126', '5.254.65.178', '213.87.123.37', '36.67.214.132', '185.182.81.21', '205.164.32.216', '207.244.83.198', '5.254.65.11', '197.210.221.42', '207.244.83.206', '5.254.65.130', '176.14.156.125', '94.138.40.7', '182.75.59.110', '162.210.197.57', '41.35.98.90', '95.139.11.104', '103.10.197.93', '39.40.168.95', '213.87.127.74', '2.94.25.98', '216.218.222.14', '207.244.82.155', '138.197.154.118', '199.249.223.69', '5.254.65.112', 
'62.120.19.197', '185.182.81.17', '148.0.140.9', '103.10.197.197', '185.182.81.15', '217.114.239.64', '185.182.81.134', '130.180.217.17', '111.88.82.111', '112.210.165.48', '81.92.200.142', '185.182.81.43', '207.244.77.3', '169.159.85.133', '5.254.65.20', '197.234.221.34', '192.162.242.11', '185.182.81.67', '103.247.48.9', '109.173.73.174', '171.25.193.20', '207.244.77.17', '176.59.44.162', '95.28.48.169', '5.254.65.113', '62.112.114.74', '5.254.65.174', '179.96.223.197', '149.202.202.78', '146.185.157.243', '110.93.216.238', '217.148.214.162', '23.100.15.206', '24.133.136.29', '59.189.57.53', '185.182.81.62', '31.181.90.52', '5.254.65.117', '94.99.170.144', '146.185.157.164', '178.62.230.201', '41.223.117.58', '197.54.177.15', '212.252.67.30', '109.252.86.175', '194.146.149.39', '207.244.83.197', '190.94.3.236', '207.244.94.9', '5.254.65.193', '5.254.65.194', '207.244.77.10', '146.185.158.9', '5.254.65.172', '37.156.22.79', '209.190.17.22', '103.10.197.156', '5.254.65.9', '178.140.62.96', '207.244.77.16', '109.161.121.117', '37.78.31.88', '185.220.101.13', '185.182.81.87', '87.107.165.5', '185.135.233.14', '209.58.148.100', '83.110.13.200', '5.254.65.132', '93.80.234.243', '190.39.198.140', '95.221.183.242', '185.222.209.32', '154.160.16.167', '80.71.249.236', '185.182.81.39', '41.79.197.12', '91.76.89.93', '45.247.206.86', '185.182.81.88', '192.81.222.95', '207.244.86.236', '197.210.29.171', '213.61.95.74', '41.254.7.18', '31.9.55.66', '154.112.8.183', '5.254.65.180', '37.187.158.97', '195.149.108.45', '94.179.147.128', '94.130.206.211', '31.173.85.237', '103.10.197.221', '46.39.83.173', '5.254.65.170', '5.254.65.18', '95.167.31.202', '207.244.83.212', '138.197.174.132', '185.182.81.35', '192.96.205.135', '130.88.240.92', '185.182.81.94', '90.147.2.94', '209.126.90.97', '128.75.107.209', '2.94.139.161', '103.106.174.102', '186.4.212.26', '37.145.142.85', '207.244.86.194', '72.211.60.177', '95.30.45.134', '80.71.253.200', '5.254.65.109', '185.182.81.38', 
'5.254.65.214', '178.62.221.111', '81.92.200.204', '89.104.104.226', '95.73.236.188', '207.244.83.113', '185.182.81.24', '103.247.48.111', '176.59.43.149', '109.233.172.188', '207.244.70.35', '84.17.230.179', '83.123.148.16', '122.171.168.163', '213.87.121.217', '110.54.232.113', '206.189.56.10', '154.160.30.129', '207.244.93.227', '178.46.96.130', '87.116.196.230', '181.208.250.104', '43.252.233.33', '207.244.82.249', '154.68.5.60', '37.27.72.154', '178.68.165.3', '194.186.9.228', '207.244.79.134', '41.34.209.13', '192.119.160.242', '37.187.129.166', '41.254.7.152', '46.38.23.226', '185.182.81.52', '27.34.104.88', '31.29.253.114', '200.17.141.3', '86.62.75.82', '93.183.219.225', '94.143.40.157', '103.10.197.194', '188.166.212.116', '185.182.81.2', '212.98.135.186', '103.10.197.92', '207.244.83.195', '217.64.17.124', '46.242.10.244', '209.58.136.236', '156.203.226.215', '103.10.197.218', '199.115.114.218', '185.182.81.8', '207.244.82.231', '95.8.0.20', '207.244.82.133', '128.74.124.28', '111.95.14.210', '213.87.126.35', '5.254.65.134', '91.240.210.185', '185.182.81.132', '95.32.139.134', '213.55.110.157', '185.182.81.26', '176.47.67.182', '91.65.54.25', '217.10.34.243', '185.182.81.133', '182.1.191.230', '103.10.197.90', '213.87.225.153', '207.244.77.184', '185.182.81.81', '5.254.65.88', '123.176.6.58', '213.55.110.206', '93.157.169.12', '103.10.197.219', '109.252.81.16', '78.111.187.63', '82.196.1.179', '185.182.81.59', '148.0.193.199', '14.202.100.66', '5.250.37.110', '156.219.245.39', '31.9.112.197', '185.112.37.25', '128.70.176.120', '185.182.81.27', '188.226.226.40', '109.174.113.45', '5.246.33.162', '5.254.65.189', '46.151.246.57', '77.71.24.206', '178.62.36.218', '185.182.81.36', '196.188.112.153', '199.249.223.49', '80.252.153.74', '87.109.169.79', '207.244.83.200', '197.210.54.144', '95.57.231.156', '207.244.94.33', '103.16.25.2', '217.174.233.46', '178.184.98.116', '185.182.81.55', '107.77.207.197', '86.21.86.205', '5.254.65.192', '38.95.108.230', 
'94.13.131.173', '107.77.209.181', '176.59.54.71', '128.179.252.54', '31.215.58.51', '207.244.83.123', '85.106.69.110', '192.160.102.170', '37.146.34.208', '46.39.228.226', '87.225.75.186', '159.89.123.60', '176.59.68.249', '37.143.20.94', '105.112.29.37', '89.146.74.31', '90.147.2.86', '217.144.175.138', '212.252.57.203', '131.117.162.145', '109.63.245.141', '92.97.199.104', '176.123.121.221', '213.142.154.3', '180.160.55.195', '176.195.133.16', '172.241.112.3', '207.244.83.217', '125.167.120.44', '109.252.85.9', '46.39.53.115', '206.189.201.4', '31.173.83.78', '199.115.115.215', '178.62.153.200']
How do I convert a list of ips like above into a list of subnets?
For example;
185.182.0.0/y, 207.244.0.0/x
Edit
I'm getting a lot of malicious traffic from several IPs that look like they're in the same subnet. However, there are a few thousand IPs; how do I convert a list like this into subnets so that I can block each subnet?
I'm guessing subnets look like 000.000.XXX.XXX, where each XXX can be a single 0?
Lets assume you have:
ban = ['162.210.197.58/32', '36.72.31.198/34', ..., '134.35.42.49/20', '176.65.117.81/32']
Use a set comprehension to ensure you get only unique elements:
# For every entry of `ban`, keep the first two dot-separated parts, append
# ".0.0/" and the text after the last "/", and collect the unique results.
subnets = set(
    '%s.0.0/%s' % ('.'.join(addr.split('.', 2)[:2]), addr.rsplit('/', 1)[-1])
    for addr in ban
)
And you get:
>>> subnets
{'162.210.0.0/32', '36.72.0.0/34', ..., '134.35.0.0/20'}
Note that an IPv4 subnet may be many different sizes, from /32 for a single IP address to /0 for the whole internet. You don't have enough information to work out how a network administrator has subnetted their network if the only thing you have is an IP address. Assuming a /24 or a /16 will often be wrong.
That said, you could use whois to lookup the net ranges associated with the ASNs which contain particular IP addresses. You could use whois from python via a tool like ipwhois to build up a list of net ranges that you consider "bad" based on the fact that you've had a contact from a bad IP in that ASN.
from ipwhois import IPWhois
# Collect, for every IP in `bad_ips`, the 'asn_cidr' value returned by a whois
# lookup, de-duplicated through a set.
# NOTE(review): requires the third-party `ipwhois` package and network access.
bad_ips = ["185.182.81.12",]
bad_asn_ranges = set()
for ip in bad_ips:
obj = IPWhois(ip)
# One whois query per address; the result is indexed like a dict below.
ret = obj.lookup_whois()
print(ret)
# presumably the CIDR range of the ASN that contains `ip` -- confirm against the ipwhois docs
bad_asn_ranges.add(ret['asn_cidr'])
print(bad_asn_ranges)
This would have unintended consequences if good people and bad people both coexist on the same ASN as one another.
If the IP addresses are stored in a list as strings, here is a very simple solution that zeroes the last two octets of every address (reducing it to its subnet) and, at the same time, drops the duplicates. It may be slow code, but it is functional.
# Reduce every IPv4 string to "<octet1>.<octet2>.0.0" and drop duplicates,
# keeping the order in which each result first appears.
data = ['162.210.197.58', '36.72.121.146', '36.72.121.4', '162.210.108.4', '190.5.140.90']
tmp = []
seen = set()  # O(1) membership test instead of scanning `tmp` for every address
for ip in data:
    # Unpacking raises ValueError unless the address has exactly four parts.
    ip1, ip2, ip3, ip4 = ip.split('.')
    subnet = ip1 + '.' + ip2 + '.0.0'
    if subnet not in seen:
        seen.add(subnet)
        tmp.append(subnet)
data = tmp
print(data)
Another example:
# Zero the last `msk` octets of every IPv4 string and drop duplicates, keeping
# the order in which each result first appears.
data = ['162.210.197.58', '36.72.121.146', '36.72.121.4', '162.210.108.4', '190.5.140.90']
msk = 1 # how many octets to set to zero, counted from the right side of the address (0-4)
tmp = []
seen = set()  # O(1) membership test instead of scanning `tmp` for every address
for ip in data:
    octets = ip.split('.')
    # Keep the leading octets and pad with `msk` zero octets. Slicing by an
    # explicit length (rather than [:-msk]) also makes msk == 0 return the
    # address unchanged and msk == 4 return '0.0.0.0'.
    keep = max(len(octets) - msk, 0)
    s = '.'.join(octets[:keep] + ['0'] * msk)
    if s not in seen:
        seen.add(s)
        tmp.append(s)
data = tmp
print(data)
A note:
It is better to store IP addresses as integers (Python has no dedicated single-byte integer type). Any operation on an IP address then becomes simple and fast: with bitwise operations (NOR, XOR, OR, AND, etc., as needed) and a mask you can do everything needed in professional computer networking. Of course, it depends on the length of the network part of the IP address, as the colleagues above mentioned. If that length is one of the most commonly used byte-aligned ones, 16 bits (2 bytes) or 24 bits (3 bytes), then my code is sufficient. However, if the length in bits is non-standard (26 bits, 12 bits, 10 bits, etc.), it is better to convert the string IP address to an integer and process it as a 4-byte IPv4 value using bitwise operations (XOR, AND, OR, ...).
This will give you the 24-bit prefixes and their number of occurrences. defaultdict is used to automatically insert a 0 in the dictionary when the key is missing, making the code simpler.
import collections

# Count how many addresses share the same first three dot-separated parts.
data = ['162.210.197.58', '36.72.31.198'] #etc....
subnets = collections.defaultdict(int)  # missing prefixes start at 0
for prefix in (".".join(address.split('.')[:3]) for address in data):
    subnets[prefix] = subnets[prefix] + 1
print(subnets)
This answer by Wang is the correct way to do this: https://stackoverflow.com/a/45503187/3176550
import ipaddress

# Turn each address string into a network object, then merge them into the
# fewest covering networks.
list_of_ips = ['10.0.0.0', '10.0.0.1', '10.0.0.2', '10.0.0.3', '10.0.0.5']
nets = list(map(ipaddress.ip_network, list_of_ips))
cidrs = ipaddress.collapse_addresses(nets)
# `cidrs` is a lazy iterator; materialising it here consumes it.
list(cidrs)
Out[6]: [IPv4Network('10.0.0.0/30'), IPv4Network('10.0.0.5/32')]
I have the following data and I need to extract the first string on each line. It is separated from the rest of the data by \t. I've tried split() and regex, but the problem is that it takes more than 1 second per line. Is there any way it could be done faster?
Data:
DT 0.00155095460731831934 0.00121897344629313064 0.00000391325536877105 0.09743272975663436197 0.00002271067721789807 0.00614528909266214615 0.00000445295550745487 0.70422975214810612510 0.00000042521183266708 0.00080380970031485965 0.00046229528280753270 0.00019894095277762626 0.00041012830368947716 0.00013156663380611624 0.00000001065986007929 0.00004244196517011733 0.00061444160944146384 0.02101761386512242258 0.00010328516871273944 0.00001128873771536226 0.00279163054567377073 0.00018903663417650421 0.00006490063677390687 0.00002151218889856898 0.00032824534915777535 0.00040349658620449016 0.00042393411014689220 0.00053643791028589382 0.00001032961180051124 0.00025743865541833909 0.00011497457801324625 0.00005359814320647386 0.00010336445810407512 0.00040942464084107332 0.00009098970100047888 0.00000091369931486168 0.00059479547081431436 0.00000009853464391239 0.00020303484015768289 0.00050594563648307127 0.15679657927655424321 0.00034115929559768240 0.00115490132012489345 0.00019823414624750937
PRP 0.00000131203717608417 0.99998368311809904263 0.00000002192874737415 0.00000073240710142655 0.00000000536610432900 0.00000195554704853124 0.00000000012203475361 0.00000017206852489982 0.00000040268728691384 0.00000034167449501884 0.00000077203219019333 0.00000003082351874675 0.00000052849070550174 0.00000319144710228690 0.00000000009512989203 0.00000002016363199180 0.00000005598551431381 0.00000129166108708107 0.00000004127954869435 0.00000099983230311242 0.00000032415702089502 0.00000010477525952469 0.00000000011045642123 0.00000006942075882668 0.00000017433924380308 0.00000028874823360049 0.00000048656924101513 0.00000017722073116061 0.00000037193481161874 0.00000000452174124394 0.00000081986547018432 0.00000001740977711224 0.00000000808377988046 0.00000001418892143074 0.00000045250939471023 0.00000000000050232556 0.00000043504206149021 0.00000011310292804313 0.00000000013241046549 0.00000015302998639348 0.00000002800056509608 0.00000038361859715043 0.00000000099713364069 0.00000001345362455494
VBD 0.00000002905639670475 0.00000000730896486886 0.00000000406530491040 0.00000009048972500851 0.00000000380338117015 0.00000000000390031394 0.00000000169948197867 0.00000000091890304843 0.00000000013856552537 0.00000191013917141413 0.00000002300239228881 0.00000003601993413087 0.00000004266629173115 0.00000000166497478879 0.00000000000079281873 0.00000180895378547175 0.00000000000159251758 0.00000000081310874277 0.00000000334322892919 0.99999591744268101490 0.00000000000454647012 0.00000000060884665646 0.00000000000010515727 0.00000000019245471748 0.00000000308524019147 0.00000001376847404364 0.00000001449670334202 0.00000001434634011983 0.00000000656887521298 0.00000000796791556475 0.00000000578334901413 0.00000000142124935798 0.00000000213053365838 0.00000000487780229311 0.00000001702409705978 0.00000000391793832836 0.00000001292779157438 0.00000000002447935587 0.00000000000435117453 0.00000000408872313468 0.00000000007201124397 0.00000000431736839121 0.00000000002970930698 0.00000000080852330796
RB 0.00000015663242474016 0.00000002464350694082 0.00000000095443410385 0.99998778106321006831 0.00000000021007124986 0.00000006156902517681 0.00000000277279124155 0.00000000301727284928 0.00000000030682776953 0.00000007379165980724 0.00000012399749754355 0.00000494600825959811 0.00000008488215978963 0.00000000897527112360 0.00000000000009257081 0.00000000223574222125 0.00000000371653801739 0.00000548300954899374 0.00000001802212638276 0.00000000022437343140 0.00000001084514551630 0.00000000328207000562 0.00000000672649111321 0.00000003640165688536 0.00000050812474700731 0.00000007422081603379 0.00000018000760320187 0.00000007733588104368 0.00000008890139839523 0.00000001494850369145 0.00000003233439691280 0.00000000299507821025 0.00000000501198681017 0.00000000271863832841 0.00000004782796496077 0.00000000000160157399 0.00000006968900381578 0.00000000003199719817 0.00000001234122837743 0.00000002204081342858 0.00000000038818632144 0.00000002327335651712 0.00000000016015202564 0.00000000435845392228
VBN 0.00222925562857408935 0.00055631931823257885 0.00000032474066230587 0.00333293927262896372 0.12594759350192680225 0.00142014631420757115 0.00008260266473343272 0.00001658664201138300 0.00000444848747905589 0.00025881226046863004 0.00176478222683846956 0.00226268536384150636 0.00120807701719786715 0.00016158429451364274 0.00000000200391980114 0.00012971908549403702 0.41488930515218963579 0.41237674095727266943 0.00025649814681915863 0.00001340291420511781 0.00067983726358035045 0.00001718712609473795 0.00009573412529081616 0.02342065200703593100 0.00010281749829896253 0.00243912549478067552 0.00111221146411718771 0.00110067534479759994 0.00048702441892562549 0.00014537544850052323 0.00046019613393571187 0.00004100416046505168 0.00001820421200359182 0.00013212194667244404 0.00112515351673182361 0.00000022002597310723 0.00099184191436586821 0.00000187809735682276 0.00000214888688830288 0.00031369371619907773 0.00000552482376141306 0.00033123576486582436 0.00000227934800338172 0.00006203126813779618
So, the bottom line is that I need to extract DT, PRP, VBD, ... from the above text really fast.
You can just call split with the maxsplit argument and wrap it in a list comprehension.
# Keep only the text before the first tab of every line (the whole line when it has no tab).
result = [line.partition('\t')[0] for line in data]
As you see, passing 1 in the method call makes it stop after the first splitting takes place. I bet this is the fastest solution in Python.
A manual alternative.
def end_of_loop():
    """Raise ``StopIteration``.

    Kept so existing callers still find it. ``my_split`` no longer uses it:
    since PEP 479 (Python 3.7+) a ``StopIteration`` raised inside a generator
    is converted to ``RuntimeError`` instead of quietly ending the iteration.
    """
    raise StopIteration


def my_split(line):
    """Return the part of ``line`` that precedes its first tab character.

    Args:
        line: the string to scan.

    Returns:
        The characters before the first ``'\\t'``; the whole string when it
        contains no tab; ``''`` when the string is empty or starts with a tab.
    """
    head = []
    for char in line:
        if char == '\t':
            # Stop at the first tab with a plain `break` rather than by
            # raising StopIteration from inside a generator expression.
            break
        head.append(char)
    return ''.join(head)
# Apply my_split to every entry of `lines`, collecting the results in order.
result = list(map(my_split, lines))
Provided your data are in a file:
# Read `file` line by line and apply my_split to each line; the file is closed
# when the `with` block ends.
with open(file) as data:
    result = list(map(my_split, data))
This will be a lot slower than the first one.
You can use split in a list comprehension :
>>> s="""DT 0.00155095460731831934 0.00121897344629313064 0.00000391325536877105 0.09743272975663436197 0.00002271067721789807 0.00614528909266214615 0.00000445295550745487 0.70422975214810612510 0.00000042521183266708 0.00080380970031485965 0.00046229528280753270 0.00019894095277762626 0.00041012830368947716 0.00013156663380611624 0.00000001065986007929 0.00004244196517011733 0.00061444160944146384 0.02101761386512242258 0.00010328516871273944 0.00001128873771536226 0.00279163054567377073 0.00018903663417650421 0.00006490063677390687 0.00002151218889856898 0.00032824534915777535 0.00040349658620449016 0.00042393411014689220 0.00053643791028589382 0.00001032961180051124 0.00025743865541833909 0.00011497457801324625 0.00005359814320647386 0.00010336445810407512 0.00040942464084107332 0.00009098970100047888 0.00000091369931486168 0.00059479547081431436 0.00000009853464391239 0.00020303484015768289 0.00050594563648307127 0.15679657927655424321 0.00034115929559768240 0.00115490132012489345 0.00019823414624750937
... PRP 0.00000131203717608417 0.99998368311809904263 0.00000002192874737415 0.00000073240710142655 0.00000000536610432900 0.00000195554704853124 0.00000000012203475361 0.00000017206852489982 0.00000040268728691384 0.00000034167449501884 0.00000077203219019333 0.00000003082351874675 0.00000052849070550174 0.00000319144710228690 0.00000000009512989203 0.00000002016363199180 0.00000005598551431381 0.00000129166108708107 0.00000004127954869435 0.00000099983230311242 0.00000032415702089502 0.00000010477525952469 0.00000000011045642123 0.00000006942075882668 0.00000017433924380308 0.00000028874823360049 0.00000048656924101513 0.00000017722073116061 0.00000037193481161874 0.00000000452174124394 0.00000081986547018432 0.00000001740977711224 0.00000000808377988046 0.00000001418892143074 0.00000045250939471023 0.00000000000050232556 0.00000043504206149021 0.00000011310292804313 0.00000000013241046549 0.00000015302998639348 0.00000002800056509608 0.00000038361859715043 0.00000000099713364069 0.00000001345362455494
... VBD 0.00000002905639670475 0.00000000730896486886 0.00000000406530491040 0.00000009048972500851 0.00000000380338117015 0.00000000000390031394 0.00000000169948197867 0.00000000091890304843 0.00000000013856552537 0.00000191013917141413 0.00000002300239228881 0.00000003601993413087 0.00000004266629173115 0.00000000166497478879 0.00000000000079281873 0.00000180895378547175 0.00000000000159251758 0.00000000081310874277 0.00000000334322892919 0.99999591744268101490 0.00000000000454647012 0.00000000060884665646 0.00000000000010515727 0.00000000019245471748 0.00000000308524019147 0.00000001376847404364 0.00000001449670334202 0.00000001434634011983 0.00000000656887521298 0.00000000796791556475 0.00000000578334901413 0.00000000142124935798 0.00000000213053365838 0.00000000487780229311 0.00000001702409705978 0.00000000391793832836 0.00000001292779157438 0.00000000002447935587 0.00000000000435117453 0.00000000408872313468 0.00000000007201124397 0.00000000431736839121 0.00000000002970930698 0.00000000080852330796
... RB 0.00000015663242474016 0.00000002464350694082 0.00000000095443410385 0.99998778106321006831 0.00000000021007124986 0.00000006156902517681 0.00000000277279124155 0.00000000301727284928 0.00000000030682776953 0.00000007379165980724 0.00000012399749754355 0.00000494600825959811 0.00000008488215978963 0.00000000897527112360 0.00000000000009257081 0.00000000223574222125 0.00000000371653801739 0.00000548300954899374 0.00000001802212638276 0.00000000022437343140 0.00000001084514551630 0.00000000328207000562 0.00000000672649111321 0.00000003640165688536 0.00000050812474700731 0.00000007422081603379 0.00000018000760320187 0.00000007733588104368 0.00000008890139839523 0.00000001494850369145 0.00000003233439691280 0.00000000299507821025 0.00000000501198681017 0.00000000271863832841 0.00000004782796496077 0.00000000000160157399 0.00000006968900381578 0.00000000003199719817 0.00000001234122837743 0.00000002204081342858 0.00000000038818632144 0.00000002327335651712 0.00000000016015202564 0.00000000435845392228
... VBN 0.00222925562857408935 0.00055631931823257885 0.00000032474066230587 0.00333293927262896372 0.12594759350192680225 0.00142014631420757115 0.00008260266473343272 0.00001658664201138300 0.00000444848747905589 0.00025881226046863004 0.00176478222683846956 0.00226268536384150636 0.00120807701719786715 0.00016158429451364274 0.00000000200391980114 0.00012971908549403702 0.41488930515218963579 0.41237674095727266943 0.00025649814681915863 0.00001340291420511781 0.00067983726358035045 0.00001718712609473795 0.00009573412529081616 0.02342065200703593100 0.00010281749829896253 0.00243912549478067552 0.00111221146411718771 0.00110067534479759994 0.00048702441892562549 0.00014537544850052323 0.00046019613393571187 0.00004100416046505168 0.00001820421200359182 0.00013212194667244404 0.00112515351673182361 0.00000022002597310723 0.00099184191436586821 0.00000187809735682276 0.00000214888688830288 0.00031369371619907773 0.00000552482376141306 0.00033123576486582436 0.00000227934800338172 0.00006203126813779618"""
>>> [i.split()[0] for i in s.split('\n')]
['DT', 'PRP', 'VBD', 'RB', 'VBN']
import re
# Match the leading run of non-whitespace characters on each line;
# re.MULTILINE makes '^' anchor at the start of every line, not just the string.
p = re.compile(r'^\S+', re.MULTILINE)
# Returns a list with one match per line that begins with a non-whitespace run.
# NOTE(review): test_str is not defined in this snippet -- presumably the
# multi-line string from the question; confirm before running.
re.findall(p, test_str)
You can simply do this to get a list of strings you want.
I would like to choose a random seed for numpy.random and save it to a variable. I can set the seed using numpy.random.seed(seed=None) but how do you get numpy to choose a random seed and tell you what it is?
NumPy seems to use /dev/urandom on Linux by default.
The full state of the MT19937 PRNG that underlies RandomState cannot be contained in a single (normally-sized, e.g. 32-bit or 64-bit) integer. It has an array of 624 32-bit integers for its state. Seeding with an integer actually runs a smaller, simpler PRNG to generate those 624 words. It is just a convenient way for humans to manually set the state of the PRNG to a state that can be consistently replicated. But most states that the PRNG gets into cannot be reduced back to a convenient 32-bit integer. That initializer PRNG cannot work "backwards" in this way. Instead, the whole state of RandomState is contained in that 624-entry array. You can get this array and set it using the get_state() and set_state() methods.
>>> import numpy as np
>>> prng = np.random.RandomState()
>>> state = prng.get_state()
>>> state
('MT19937',
array([2310623686, 364919541, 1436109096, 1457837701, 2852017530, 562204638, 1207376362, 2290452263, 250624867, 1687514807, 3242300311, 68301227,
497650124, 3782308076, 4180165271, 3190969185, 1284472452, 2868357773, 1148940887, 433865334, 643839653, 3091921054, 2157305915, 4079505239,
1396964105, 221256094, 2789328727, 3216471912, 1782932723, 1704818545, 3880597634, 2060476197, 2599008138, 1389874875, 56765165, 1173841349,
278528026, 714062321, 3587382791, 840507318, 2086996355, 3416087866, 3081938567, 946222923, 4259369972, 868558506, 2060774692, 3239317074,
4078800142, 3833877854, 1503749328, 3821805560, 1447854235, 995535877, 3762179650, 185008825, 149218213, 3469766149, 803379340, 3971043961,
3421104633, 2287066419, 2465098532, 4088166586, 2105722956, 1451099732, 3115885598, 4240224392, 3778829453, 4059831750, 2919989511, 4092928731,
922778621, 1805422791, 3344418665, 1738799711, 1367565729, 34977430, 4008589298, 2239856842, 1717530303, 32049105, 3468621644, 2269299060,
1664083607, 3996022881, 377407365, 4070209212, 4216115381, 2124999225, 1920630572, 2011423407, 1367187092, 4158622494, 487432561, 3536187733,
931951977, 749985693, 2812437433, 3902171864, 767004922, 3807520852, 796884475, 2794577773, 1481140267, 2247603372, 1053872430, 211335743,
2997489007, 4140013480, 1601875594, 1927437737, 3349007801, 2868575676, 3474179396, 595650352, 517981041, 1947095736, 170970294, 3253183597,
2873789192, 3386930182, 2047755893, 254974719, 2747566023, 4182212825, 1934990158, 1282861435, 404005052, 3237256048, 1737335951, 386655885,
640537519, 60176882, 1825713593, 86537970, 252007523, 3674897989, 3645447766, 972417578, 1860821974, 2688102651, 2481103756, 3672142036,
2961031222, 1709451377, 134371222, 4217784577, 3792528752, 1278543741, 291978547, 1987232116, 2685749450, 948431490, 3550698848, 1384058130,
302186886, 2966159795, 1981959565, 2602891721, 1814325871, 4148300386, 1211156469, 2945951607, 4132724234, 1221821676, 3057395063, 1563869020,
3762934166, 3303914085, 1910775932, 2241726842, 3836262483, 905479357, 2974032168, 3187395363, 3071243546, 3571439927, 3756380578, 53494506,
495375628, 2149633842, 1549467921, 403773184, 3774309942, 1767528278, 421982610, 579688614, 3735062896, 2128447283, 2545877077, 3013437905,
4067651631, 26043227, 3189924699, 1882256309, 431961449, 3637287121, 1409924095, 3834921204, 3796550515, 338734970, 1632375419, 3788135288,
153287562, 2302436235, 3852961194, 2073555800, 3034065218, 1997718747, 3343015031, 3198064720, 4286393046, 3338997777, 1383744819, 1553624825,
1183357509, 1141531260, 25823987, 2951322047, 4066666075, 3687780778, 3680053857, 478734258, 3674686218, 1457141125, 3673486342, 3224971043,
2786082270, 2282591016, 1210618789, 3735610308, 587294285, 4231880327, 3702701983, 13470000, 90747549, 876795924, 1489448380, 585176585,
2398768918, 3069244786, 2901497718, 4004899727, 1992450245, 1127097566, 713011674, 2083831719, 2923291311, 315998911, 1511233310, 1515243002,
621858088, 2398475656, 3029652473, 1011396654, 1854317252, 2735915680, 1489448619, 3836317799, 1678027486, 2429831383, 170989290, 651235170,
1457126476, 3694269669, 4248613755, 3161380741, 3396304589, 26218095, 4262314194, 3090365505, 2603976562, 1742639443, 3357356842, 2527908520,
2744118109, 764708873, 608716002, 218517036, 2028062957, 123264851, 3930797933, 1358280349, 3770182726, 1475205800, 4083653367, 728440387,
578359463, 3792859449, 2660424205, 866268419, 2680711984, 1892477918, 3473675890, 5948212, 590585309, 1434154869, 4019090587, 3447601971,
3777365598, 502271900, 933280098, 551410763, 4178545332, 2426657681, 435161245, 103552671, 2751130089, 1664159723, 2124278140, 3518289293,
1397473574, 4032873848, 3104766011, 3780526375, 146118438, 3497842141, 2078614647, 1431064844, 825222639, 954382890, 3170571595, 1418867403,
4133763948, 2773874577, 459104952, 3336058631, 791669682, 79496438, 1268256964, 1327605157, 3196785479, 3094404795, 3971934915, 967528556,
1680157581, 1508139540, 3821158380, 3603819236, 593155253, 1875654417, 3734837198, 3315972391, 2450938455, 1863178045, 619766009, 1376779265,
843230528, 1818810226, 1508689309, 1353144904, 3459699509, 734863896, 1593154156, 4178196553, 559982910, 1937392142, 3328058492, 2417976146,
3197182411, 2233439700, 196920494, 3714701774, 4104568606, 850977604, 382851029, 4143478133, 3024891142, 2455897904, 28681198, 3438784382,
578301023, 2215641381, 59642080, 2913625733, 2063824530, 2113835214, 563503294, 2261300428, 1156324177, 3080988993, 1485826140, 291045970,
3740234437, 2802003429, 804278225, 1715783317, 3683156408, 2855890524, 2390104305, 172369852, 3358371994, 1184782876, 2087670358, 840924195,
2727925375, 1806621317, 2785628046, 4163132724, 3580142689, 1107366902, 809125531, 3131770778, 1922818283, 888842000, 2875999147, 2752567229,
170460348, 1952532683, 1705378473, 1784443344, 1111435234, 2373828316, 1440965774, 3986117425, 849160375, 1233392480, 4073490673, 3948548975,
2317742686, 459747729, 3981827733, 97170450, 1906613346, 2296986726, 3107045483, 3301310854, 2005065797, 1047441812, 1340913878, 1305190832,
3414530672, 2739562683, 670592573, 3517927973, 3902124497, 4085960935, 823980090, 982263838, 1807290575, 1182843877, 3543714667, 1403590968,
329717243, 1055811172, 3550329386, 3998515559, 3251582755, 2201054306, 3347834116, 1211790680, 62972368, 88227180, 2967020240, 1937245345,
524567284, 2915223835, 1039263578, 931149438, 2102426452, 4178383760, 2534760455, 3961494901, 359726861, 2377704223, 3980574430, 3941075859,
3025460765, 1087397787, 1520908724, 3979084899, 3800423495, 139799221, 644687977, 1080267251, 599331265, 379370383, 3716980301, 2450151406,
1223752702, 300351842, 295249068, 1870733374, 2986315084, 1323736886, 306347366, 2697516131, 3896227616, 2556699990, 578928278, 2356101730,
171880210, 722319049, 740054230, 3855145369, 1468149367, 311954206, 4099077708, 2941657479, 119786529, 3197372768, 2115311247, 2469241538,
2636086203, 2206369175, 374899905, 3730393440, 2288141890, 719446033, 4096038147, 4294410470, 19272682, 1964868281, 3192582061, 3934009074,
1135732985, 682697379, 3290113635, 1489105351, 347638343, 147496092, 4175447059, 341595821, 3117140389, 1003085251, 1889252416, 913732530,
3459561042, 3662473182, 3839509269, 1519115576, 677113, 597583022, 3031451769, 607339281, 55523370, 2676982537, 1238056185, 1550912054,
3112284354, 1345961520, 1541909925, 3726796822, 2696250478, 3254836471, 1362613883, 3129122359, 1550126204, 129690651, 2386622242, 407302605,
1753882614, 2376840660, 1076064874, 2449053256, 3162294193, 3779999195, 3925427556, 2601606505, 1901788890, 2217639773, 406665902, 3640687773,
2061876750, 968895635, 587973195, 2778479214, 668417883, 2226398520, 1464491431, 2792659882, 3481258691, 2339776369, 2747947338, 3000199533,
3712567952, 376206272, 2149616269, 985682501, 865295391, 1812641626, 567425379, 1468520640, 2273677177, 2267568076, 3898328230, 898149034,
3750298043, 394538907, 4101461357, 2781824777, 2719406676, 3415420393, 122661889, 1452536307, 1463257506, 2874481787, 2250093815, 1439068642,
597070280, 1439076517, 4207797347, 2579732532, 3704826787, 3847236064, 4155289003, 990963026, 2602619627, 701644802, 3629646548, 1110000288,
3609356614, 2748019645, 638526248, 3265491895, 2839687161, 913026615, 2748040592, 975131382, 83378202, 4236013846, 764917668, 1887262417], dtype=uint32),
624,
0,
0.0)
>>> prng.random_sample()
0.20598058788141316
>>> prng.random_sample()
0.6864005375257146
>>> prng.random_sample()
0.08407651896523582
>>> prng.set_state(state)
>>> prng.random_sample()
0.20598058788141316
>>> prng.random_sample()
0.6864005375257146
You can also pickle RandomState objects. We implemented this using the get_state() data, so it will reliably reproduce the state of the PRNG. Depending on exactly what you want to do (you don't say), this is frequently the most convenient thing to do rather than mucking about with get_state() and set_state() manually.
>>> import cPickle
>>> pickled = cPickle.dumps(prng)
>>> prng.random_sample()
0.08407651896523582
>>> prng.random_sample()
0.3501860271954601
>>> prng2 = cPickle.loads(pickled)
>>> prng2.random_sample()
0.08407651896523582
>>> prng2.random_sample()
0.3501860271954601
When people need a random seed that can be recorded, people usually use the system time as a random seed. This means your program will act differently each time it is run, but can be saved and captured. Why don't you try that out?
If you don't want to do that for some reason, use the null version, numpy.random.seed(seed=None), then get a random number from it, then set the seed to that new random number.
You can't… but there's really no good reason to do so. Unless you're actually trying to reproduce the behavior of seed, rather than put the RNG into a repeatable state, you're trying to add an extra level of indirection for no reason.
If you want to stash and restore the RandomState, do that, using the get_state() and set_state() functions.
If you really want to use seed instead, you can just use np.random to generate a random seed (e.g., via random_integers(0, 255, SOME_LENGTH)), which you can stash and reuse later. But there's not much reason to do that.
Or, of course, you can call Python's os.urandom to create a seed the same way NumPy does by default. Note that the docs explicitly say that:
"If seed is None, then RandomState will try to read data from /dev/urandom (or the Windows analogue) if available or seed from the clock otherwise."
But again, there's not much reason to do that either. (Also, it isn't documented how much randomness it gets from urandom, so there's always the risk that you'll be seeding it with less random data than it normally uses, or wastefully gathering too much.)
According to the docs, when seed is None, numpy tries to read from /dev/urandom, so why not just read a value from /dev/urandom, save it, and pass it to numpy.random.RandomState?
EDIT:
The internal state can be retrieved and restored via get_state and set_state, respectively. So, to recover the initial state, one would do something like this:
>>> import numpy
>>> r = numpy.random.RandomState()
>>> saved_state = r.get_state()
>>> r.rand()
0.9091545657342729
>>> r.rand()
0.9677739782319564
>>> r.rand()
0.5656156400920441
>>> r.set_state(saved_state)
>>> r.rand()
0.9091545657342729
>>> r.rand()
0.9677739782319564
>>> r.rand()
0.5656156400920441
>>>
When seed is None, numpy doesn't pick a "new random seed" and call seed() with it. It reads 624 * sizeof(long) bytes (~ 2.5KB) from /dev/urandom and uses those values to populate the state struct. When you call seed() without arguments, numpy never actually "chooses" a "random seed". Therefore, it's not possible to recover it.
If you want, you can also save the state in a JSON file, load it back later, and use it again. Since the json module can't serialize NumPy arrays directly, you need to convert the state yourself, but it's not that bad:
One file (my_rand_lib.py, the module imported by the scripts below):
import json
import numpy as np
def put_numpy_seed_in_json_dic(results):
    """Store the global NumPy RNG state in ``results`` in a JSON-friendly form.

    ``np.random.get_state()`` returns a five-element tuple whose second
    element is an array, which ``json`` cannot encode.  That array is
    converted to a list of plain Python integers; the other four elements
    are stored unchanged.

    Args:
        results: dict that is updated in place; its ``'rand_seed'`` entry is
            overwritten.

    Returns:
        The same ``results`` dict, with the state tuple under ``'rand_seed'``.
    """
    (rnd0, rnd1, rnd2, rnd3, rnd4) = np.random.get_state()
    # tolist() converts every element to a native Python int in one call,
    # which is what json.dump needs.
    rnd1 = rnd1.tolist()
    rand_seed = (rnd0, rnd1, rnd2, rnd3, rnd4)
    results['rand_seed'] = rand_seed
    return results
def get_numpy_seed(results):
    """Rebuild a NumPy RNG state tuple from its JSON-loaded form.

    Args:
        results: dict whose ``'rand_seed'`` entry is the five-element sequence
            stored by ``put_numpy_seed_in_json_dic`` (it arrives as a list
            after a JSON round trip).

    Returns:
        A five-element tuple that can be passed to ``np.random.set_state``;
        its second element is a ``uint32`` array, the other four elements
        are passed through unchanged.
    """
    (rnd0, rnd1, rnd2, rnd3, rnd4) = results['rand_seed']
    # A single array conversion replaces building one np.uint32 scalar per
    # element and yields the array form that get_state() returns.
    rnd1 = np.asarray(rnd1, dtype=np.uint32)
    rand_seed = (rnd0, rnd1, rnd2, rnd3, rnd4)
    return rand_seed
then run it to save the seed:
import json
import numpy as np
import my_rand_lib as mr
results = {'rand_seed':None}
results = mr.put_numpy_seed_in_json_dic(results)
print np.random.rand(1)
print np.random.rand(1)
print np.random.rand(1)
fpath = './rand_seed_file'
with open(fpath,'w+') as f:
json.dump(results,f)
print '... doing other stuff'
with open(fpath,'r+') as f:
results2 = json.load(f)
print 'other ',np.random.rand(1)
print 'other ',np.random.rand(1)
print 'other ',np.random.rand(1)
print '... done doing stuff'
rand_seed = mr.get_numpy_seed(results2)
np.random.set_state(rand_seed)
print np.random.rand(1)
print np.random.rand(1)
print np.random.rand(1)
And if you don't want to generate a new seed every time you run it, you can load the saved one instead:
import json
import numpy as np
import my_rand_lib as mr
fpath = './rand_seed_file'
with open(fpath,'r+') as f:
results2 = json.load(f)
rand_seed = mr.get_numpy_seed(results2)
np.random.set_state(rand_seed)
print np.random.rand(1)
print np.random.rand(1)
print np.random.rand(1)
I tried this on a remote server and I always get matching random numbers:
[ 0.90741273]
[ 0.6861296]
[ 0.21714398]
Not sure if this is interesting, but this was the saved state (a tuple in NumPy, stored as a list in the JSON file):
{"rand_seed": ["MT19937", [3244492226, 4276548057, 571402114, 3235873143, 4078239958, 1440625038, 4042777784, 3400010150, 1164584760, 271139028, 1264217608, 1403324904, 234696259, 623484078, 3424719234, 3896351743, 1818071683, 3077380191, 2989066157, 3828180331, 2032001745, 1137603205, 1993713826, 873523654, 3267461254, 2964954176, 3217679339, 4079232021, 1182272168, 402998421, 968119626, 2151162455, 2550226639, 3522780791, 245256811, 2866158388, 587411937, 2836234133, 3485394274, 1767143488, 3772379711, 1244725495, 1061026769, 2544419920, 3963050848, 232749713, 2084368489, 1990090546, 2883903063, 174001222, 2569537698, 517341511, 2366955295, 1830324490, 2388090514, 1637855850, 1383101875, 2719629528, 885528387, 7941101, 2769663894, 2704541593, 3129289945, 2681434614, 3308402481, 2161196492, 2896442132, 1474561199, 156414990, 2934014108, 2740454316, 4029663532, 2903418479, 118978587, 3095335574, 1044532364, 2629619463, 623783821, 3172307947, 2539001597, 2020636966, 404303542, 373288588, 289388097, 1050356390, 1126919064, 474676333, 2156863001, 92975776, 1204572119, 1341956590, 4284155262, 3380981209, 1268302262, 835613316, 623125230, 1150083001, 3444902937, 2318349536, 2881496834, 393068269, 28626933, 2931354423, 2014174400, 4212996966, 3105086458, 74404022, 413795342, 3782258177, 3626466932, 1932129332, 3538419256, 943472124, 963175815, 4076955699, 52410025, 318657184, 839799912, 2150435130, 3187525421, 2124551508, 3930704180, 2375548757, 497820208, 422355274, 260159836, 3437157934, 1301403840, 4057357702, 3217300631, 2910194797, 1972036860, 624838554, 3418367281, 3823714808, 1342594222, 3874939587, 3578421466, 3997730187, 751930224, 801189513, 1225089722, 910752086, 1415351761, 4287089458, 224210780, 643596696, 1030838729, 1924676141, 2579935013, 32904138, 2486616018, 1665731347, 642496995, 577928776, 4119274366, 1438990597, 885648199, 2401966414, 1937630298, 2029522084, 3823943785, 1652388617, 242507028, 163957584, 197993457, 3003700508, 2357598705, 479742798, 
2159530434, 2641855048, 1153321528, 458640940, 1364908158, 3931878737, 3754891907, 733317650, 3631844997, 209681576, 780025499, 217109730, 2659949782, 164210317, 2234081627, 2798187303, 3793035212, 622613442, 4027945659, 1264924240, 3755962138, 168637328, 4193297896, 593711399, 2018193001, 696136156, 3343926759, 3938753383, 3549915312, 2049590636, 1732826453, 3770804132, 1544263650, 3623494103, 1454784121, 860580298, 1336846278, 3298403325, 4156569419, 51196786, 3398541940, 717201402, 1418590160, 3407195989, 293192063, 3871127471, 963318294, 3177164855, 2248856336, 2363561954, 2122436074, 3083439454, 331898151, 3489466823, 1480231253, 3727404028, 1942269624, 3342915239, 2451833278, 1279324699, 3678779848, 494256563, 170826038, 3200966622, 3284372389, 3798475074, 191206256, 1112201427, 3959301392, 43618741, 1358008929, 2972254642, 2250013335, 659600256, 720199815, 1355589829, 1511937267, 2090180739, 2779086170, 704140912, 1354505400, 4106508219, 4130987887, 1135113560, 3310205054, 2559493616, 3994237157, 2449530906, 1017478859, 2475414025, 260408932, 3882314025, 3169908095, 1431718224, 755730563, 4129813635, 482751982, 42657908, 2418940148, 2380660631, 3596648617, 2668040386, 3700947086, 1235361153, 4212839143, 2803192914, 679783840, 1396721631, 3549531060, 3714188947, 1582886984, 3930587164, 1787845200, 1878170563, 3998685888, 275016726, 1362149445, 1784854500, 3413367687, 999979145, 30464988, 1781846287, 2052179802, 614372595, 1795389478, 3837746383, 1716252322, 1496633789, 1913960414, 3824749341, 745150948, 2990885936, 3557188824, 1853716952, 226442384, 3881419361, 3877508921, 2125849259, 3725330620, 4249819850, 1866002740, 3954375926, 1263697298, 2359110923, 3704149399, 3915156522, 2720534920, 2240262865, 1298116022, 2430494738, 3106481019, 1118448263, 3386525375, 3850025930, 947096317, 2014058358, 2943385566, 1639655978, 824538918, 2893393554, 190010755, 918084027, 4197568458, 2308675470, 3969533604, 823650146, 3971685975, 3959021418, 2335451148, 3651109937, 
3536101054, 2028026981, 1042621858, 2093418547, 3332527479, 345797902, 1962843497, 1651609280, 849683942, 701440541, 3001603849, 2547855201, 2847179356, 2686463194, 2556105058, 2957249371, 4122354156, 4095666057, 3269707747, 2075948426, 4189148196, 59188700, 1425136277, 4010662242, 403095998, 2435933607, 3254626634, 320429604, 921618676, 4179054005, 1590495757, 362965764, 3892792894, 4264771139, 300303781, 4194594842, 1773582295, 1792749320, 3114744569, 3059831369, 543108826, 605116437, 1206221920, 3763708911, 3474933214, 933590768, 4096747554, 2732890014, 1180321103, 3174872523, 2361419553, 303084740, 3438967187, 829657141, 3976738932, 3250508727, 2965752967, 2766618501, 2610047728, 3913791738, 2383381107, 2911412379, 2570048205, 1059652767, 1105153800, 258287599, 1366361775, 1043101709, 4136777479, 4002476750, 2242511114, 1937386895, 2318776696, 3919577988, 819932046, 2154232126, 2359937340, 1707529303, 1430709021, 57940224, 2463543918, 439698027, 2154236676, 2989369870, 2711983380, 1243586438, 1648109179, 234677646, 1369164631, 3246772730, 1150951970, 707111532, 2641066313, 1561023105, 2352529521, 3905609297, 3758075920, 4124559541, 3768803924, 3443976002, 2619333832, 3399759018, 2295667887, 4126858561, 3139541980, 2382271429, 4033423715, 648775734, 2777131955, 1929238235, 2146942632, 1115329972, 3985641642, 2007135435, 3551753547, 2967740448, 4196112540, 61581572, 886344810, 4097187928, 1166916633, 3890455280, 1473584306, 1440678763, 2848991175, 2493980496, 3967544385, 1757152663, 52315252, 2476642029, 2727074449, 4197000746, 2878883929, 144032869, 3517610268, 3758074755, 164078969, 4288210033, 1130401207, 2376285572, 3726677017, 2021546352, 2763363362, 791950895, 1834778577, 3067448324, 2618082688, 4194263605, 84230440, 624358904, 3203686228, 2014115933, 1844566018, 314698511, 1096366940, 1413533306, 2490690918, 3524310116, 3912232452, 3595400103, 2104097721, 2277699865, 2127808758, 890104002, 4261780514, 3943759279, 2421596910, 3462302371, 3114202694, 
3301664792, 3958641805, 3828288008, 3138631754, 2707054121, 873889048, 360096040, 1277036249, 310404450, 1841086653, 1324064291, 1069123460, 3667889879, 3549162319, 4105010862, 3802778145, 1818048305, 3083126999, 3810922140, 2364932315, 2667079274, 3034477663, 1142598277, 2129656233, 2900596493, 1771721766, 2091125900, 2024931777, 2186939139, 4292757779, 3168005700, 2700706967, 2033965363, 2815886839, 1936909550, 1018210446, 2494829103, 3182190430, 4070030839, 3878343946, 3290625485, 2885062721, 3427598831, 3748858811, 2021454997, 2926497731, 3462334646, 747641905, 2870980834, 1072943394, 370913272, 519334913, 3099507262, 698616436, 3884871568, 2530196197, 223690634, 1816574877, 2872502342, 3629966511, 4040316403, 400367036, 1898479168, 1795033191, 4090946019, 938326326, 1509105095, 886381170, 4207241822, 2919702734, 1437184594, 2765872952, 561764883, 3441440757, 1219765705, 209412518, 1098738818, 3782425126, 3113624586, 3302772981, 1213966890, 4292826280, 4109015079, 1949958581, 320991923, 1070765942, 2002780881, 3364869673, 3039286974, 1824574474, 1266616388, 703321141, 2004303453, 1284326590, 728587648, 427042526, 2160662521, 2783764788, 3053336315, 3542331332, 2881174731, 4160514263, 3326878203, 4139791808, 2639767143, 3144886711, 480269073, 2318151636, 3594165209, 1629301762, 1786754501, 1157007028, 1415023980, 172137771, 3444342355, 3889095376], 624, 0, 0.0]}