Combining three lists in Python with sorting - python

How to efficiently and smartly combine 3 lists in the way like below?
sex = ['M', 'M', 'F', 'F', 'M', 'F', 'M', 'F', 'F', 'F']
actresses = ['Natalie Portman', 'Anne Hathaway', 'Talia Shire', 'Diane Keaton', 'Keira Knightley', 'Uma Thurman']
actors = ['Morgan Freeman', 'Leonardo DiCaprio', 'Robert De Niro', 'Brad Pitt']
Result:
[('M', 'Morgan Freeman'),
('M', 'Leonardo DiCaprio'),
('F', 'Natalie Portman'),
('F', 'Anne Hathaway'),
('M', 'Robert De Niro'),
('F', 'Talia Shire'),
('M', 'Brad Pitt'),
('F', 'Diane Keaton'),
('F', 'Keira Knightley'),
('F', 'Uma Thurman')]
My solution:
sex = ['M', 'M', 'F', 'F', 'M', 'F', 'M', 'F', 'F', 'F']
actresses = ['Natalie Portman', 'Anne Hathaway', 'Talia Shire', 'Diane Keaton', 'Keira Knightley', 'Uma Thurman', ]
actors = ['Morgan Freeman', 'Leonardo DiCaprio', 'Robert De Niro', 'Brad Pitt']
result = []
for s in sex:
if s == 'F':
result.append((s, actresses.pop(0)))
elif s == 'M':
result.append((s, actors.pop(0)))
print(f'result = {result}')
What is the best way for a long lists (e.g. 1 million items)?

You can place references to the lists in a dictionary and do a list comprehension
In [8]: sexes = ['M', 'M', 'F', 'F', 'M', 'F', 'M', 'F', 'F', 'F']
...: actresses = ['Natalie Portman', 'Anne Hathaway', 'Talia Shire', 'Diane Keaton', 'Keira Knightley', 'Uma Thurman', ]
...: actors = ['Morgan Freeman', 'Leonardo DiCaprio', 'Robert De Niro', 'Brad Pitt']
...:
...: mf = {'M':iter(actors), 'F':iter(actresses)}
...: [(sex, next(mf[sex])) for sex in sexes]
Out[8]:
[('M', 'Morgan Freeman'),
('M', 'Leonardo DiCaprio'),
('F', 'Natalie Portman'),
('F', 'Anne Hathaway'),
('M', 'Robert De Niro'),
('F', 'Talia Shire'),
('M', 'Brad Pitt'),
('F', 'Diane Keaton'),
('F', 'Keira Knightley'),
('F', 'Uma Thurman')]
In [9]:
If your list are longish and you are going to consume one pair sex-person at once you can use a generator expression in place of the list comprehension
pairs = ((sex, next(mf[s])) for sex in sexes)
for sex, person in pairs:
...
or possibly even simpler
for sex in sexes:
person = next(mf[sex])
...
If your lists were stored on disk you can use the same pattern introduced above but using generator expressions in place of lists
mf = {'M':(line.strip() for line in open('male_performers.txt'),
'F':(line.strip() for line in open('female_performers.txt')}
sexes = (line.strip() for line in open('sexes.txt'))
for sex in sexes:
performer = next(mf[sex])

You are popping from starting of the list which has time complexity of O(N). What you could do instead is keep an index for both actors and actresses lists and increment them in the loop.
sex = ['M', 'M', 'F', 'F', 'M', 'F', 'M', 'F', 'F', 'F']
actresses = ['Natalie Portman', 'Anne Hathaway', 'Talia Shire', 'Diane Keaton', 'Keira Knightley', 'Uma Thurman', ]
actors = ['Morgan Freeman', 'Leonardo DiCaprio', 'Robert De Niro', 'Brad Pitt']
result = []
actors_i = 0
actresses_i = 0
for s in sex:
if s == 'F':
result.append((s, actresses[actresses_i]))
actresses_i += 1
elif s == 'M':
result.append((s, actors[actors_i]))
actors_i += 1
print(f'result = {result}')
After this point, I don't think there are any improvements left other than making your code more readable because you have to go over every item in the sex list and you are using operations which has cost of O(1) in the loop. So the complexity is O(N).

Given that all actors have a label of 'M' and all actresses have a label of 'F', you could use pandas to group the information in a way that should have faster performance than looping through large lists.
Here is an example:
import pandas as pd
actresses = ['Natalie Portman', 'Anne Hathaway', 'Talia Shire', 'Diane Keaton', 'Keira Knightley', 'Uma Thurman', ]
actors = ['Morgan Freeman', 'Leonardo DiCaprio', 'Robert De Niro', 'Brad Pitt']
df_actresses = pd.DataFrame(actresses, columns=['name'])
df_actors = pd.DataFrame(actors, columns=['name'])
df_actresses['sex'] = 'F'
df_actors['sex'] = 'M'
df = pd.concat([df_actresses, df_actors], axis=0)
# if you really need it to be a list
result = df.values.tolist()

Thank you for all the answers. Yes, using pop(0) was a very bad idea in this case. I tried to compare all solutions for 1 million pseudo items. In my opinion the results were very good except for the use of pop(0).
Results:
combine_with_pop Items = 1000000. Average time: 45.49504270553589 secs
combine_without_pop Items = 1000000. Average time: 0.33301634788513185 secs
combine_dict Items = 1000000. Average time: 0.21431212425231932 secs
combine_generator Items = 1000000. Average time: 0.2770370960235596 secs
combine_frames Items = 1000000. Average time: 0.06862187385559082 secs
Test:
import pandas as pd
import string
import random
import time
import inspect
from statistics import mean
result_size = 1000000
g_number_of_repetitions = 5
def init():
# Generate sexes
population = ('M', 'F')
male_weight = 0.48
weights = (0.4, 1 - male_weight)
actresses = []
actors = []
sexes = random.choices(population, weights, k=result_size)
male_amount = sexes.count('M')
female_amount = result_size - male_amount
# Generate pseudo 'actresses' and 'actors'
act_len = 20
for a in range(female_amount):
actresses.append(''.join(random.choices(string.ascii_lowercase, k=act_len)))
for a in range(male_amount):
actors.append(''.join(random.choices(string.ascii_lowercase, k=act_len)))
return sexes, actresses, actors
def combine_with_pop(number_of_repetitions, sexes, random_actresses, random_actors):
time_measurements = []
for i in range(number_of_repetitions):
actors = random_actors[:]
actresses = random_actresses[:]
result = []
t0 = time.time()
for s in sexes:
if s == 'F':
result.append((s, actresses.pop(0)))
elif s == 'M':
result.append((s, actors.pop(0)))
time_one_round = time.time() - t0
time_measurements.append(time_one_round)
print(
f'{inspect.currentframe().f_code.co_name.ljust(20)} '
f'Items = {result_size}. Average time: {str(mean(time_measurements))} secs')
def combine_without_pop(number_of_repetitions, sexes, random_actresses, random_actors):
time_measurements = []
for i in range(number_of_repetitions):
actors = random_actors[:]
actresses = random_actresses[:]
result = []
actors_i = 0
actresses_i = 0
t0 = time.time()
for s in sexes:
if s == 'F':
result.append((s, actresses[actresses_i]))
actresses_i += 1
elif s == 'M':
result.append((s, actors[actors_i]))
actors_i += 1
time_one_round = time.time() - t0
time_measurements.append(time_one_round)
print(
f'{inspect.currentframe().f_code.co_name.ljust(20)} '
f'Items = {result_size}. Average time: {str(mean(time_measurements))} secs')
def combine_dict(number_of_repetitions, sexes, random_actresses, random_actors):
time_measurements = []
for i in range(number_of_repetitions):
actors = random_actors[:]
actresses = random_actresses[:]
result = []
t0 = time.time()
mf = {'M': iter(actors), 'F': iter(actresses)}
result = [(sex, next(mf[sex])) for sex in sexes]
time_one_round = time.time() - t0
time_measurements.append(time_one_round)
print(
f'{inspect.currentframe().f_code.co_name.ljust(20)} '
f'Items = {result_size}. Average time: {str(mean(time_measurements))} secs')
def combine_generator(number_of_repetitions, sexes, random_actresses, random_actors):
time_measurements = []
for i in range(number_of_repetitions):
actors = random_actors[:]
actresses = random_actresses[:]
result = []
t0 = time.time()
mf = {'M': iter(actors), 'F': iter(actresses)}
for sex in sexes:
person = next(mf[sex])
result.append((sex, person))
time_one_round = time.time() - t0
time_measurements.append(time_one_round)
print(
f'{inspect.currentframe().f_code.co_name.ljust(20)} '
f'Items = {result_size}. Average time: {str(mean(time_measurements))} secs')
def combine_frames(number_of_repetitions, sexes, random_actresses, random_actors):
time_measurements = []
for i in range(number_of_repetitions):
actors = random_actors[:]
actresses = random_actresses[:]
result = []
df_actresses = pd.DataFrame(actresses, columns=['name'])
df_actors = pd.DataFrame(actors, columns=['name'])
t0 = time.time()
df_actresses['sex'] = 'F'
df_actors['sex'] = 'M'
df = pd.concat([df_actresses, df_actors], axis=0)
# if you really need it to be a list
# result = df.values.tolist()
time_one_round = time.time() - t0
time_measurements.append(time_one_round)
print(
f'{inspect.currentframe().f_code.co_name.ljust(20)} '
f'Items = {result_size}. Average time: {str(mean(time_measurements))} secs')
g_sexes, g_actresses, g_actors = init()
combine_with_pop(g_number_of_repetitions, g_sexes, g_actresses, g_actors)
combine_without_pop(g_number_of_repetitions, g_sexes, g_actresses, g_actors)
combine_dict(g_number_of_repetitions, g_sexes, g_actresses, g_actors)
combine_generator(g_number_of_repetitions, g_sexes, g_actresses, g_actors)
combine_frames(g_number_of_repetitions, g_sexes, g_actresses, g_actors)

Related

Why am I getting a garbage tokenizer?

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers
tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
trainer = trainers.UnigramTrainer(
vocab_size=30000,
initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
special_tokens=["<PAD>", "<BOS>", "<EOS>", '<s>', '</s>', '<unk>', '<mask>'],
min_frequency = 2
)
def batch_iterator(batch_size=10, size=5000):
for i in range(100):
query = f"select note_text from db.note where id > {i * size} limit 50;"
df = pd.read_sql(sql=query, con=db)
for x in range(0, size, batch_size):
yield list(df['note_text'].loc[0:5000])[x:x + batch_size]
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=100*5000)
A single note may look something like this:
!~!~!~!~!~!~!~!~!~!~!~!~!~!~Discussing settlement with Amy.!~!~
The output looks as follows:
out = tokenizer.encode('There should be an inspection come Monday 1/2/2022!')
['ĠThe', 'r', 'e', 'Ġsh', 'ould', 'Ġbe', 'Ġan', 'Ġinspect', 'ion', 'Ġ', 'com', 'e', 'Ġ', 'M', 'ond', 'a', 'y', 'Ġ', '1', '/', '2', '/', '20', '2', '2', '!']

Removing accents from keyword strings

This is a word processing code for chabot, in it it removes some articles and prepositions to make it easier for the bot to read
import json
from random import choice
class ChatterMessage:
def __init__(self, raw):
self.raw = str(raw).lower()
self.processed_str = self.reduce()
self.responses = self.get_responses()
self.data = self.process_response()
self.response = choice(self.data['response'])
def remove_unwanted_chars(self, string):
list_of_chars = ['?', ".", ",", "!", "#", "[", "]", "{", "}", "#", "$", "%", "*", "&", "(", ")", "-", "_", "+", "="]
new_str = ""
for char in string:
if char not in list_of_chars:
new_str += str(char)
return new_str
def get_responses(self, response_file="info.json"):
with open(response_file, 'r') as file:
return json.loads(file.read())
def reduce(self):
stopwords = ['de', 'a', 'o', 'que', 'e', 'é', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à', 'seu', 'sua', 'ou', 'quando', 'muito', 'nos', 'já', 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'depois', 'sem', 'mesmo', 'aos', 'seus', 'quem', 'nas', 'me', 'esse', 'eles', 'você', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'numa', 'pelos', 'elas', 'qual', 'nós', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'dele', 'tu', 'te', 'vocês', 'vos', 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas', 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos', 'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos', 'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver', 'estivermos', 'estiverem', 'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 'houvera', 'houvéramos', 'haja', 'hajamos', 'hajam', 'houvesse', 'houvéssemos', 'houvessem', 'houver', 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos', 'houverão', 'houveria', 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'foi', 'fomos', 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for', 'formos', 'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém', 'tinha', 'tínhamos', 'tinham', 'tive', 'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha', 'tenhamos', 'tenham', 'tivesse', 'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', 'teremos', 'terão', 'teria', 'teríamos', 'teriam']
custom_filter = []
keywords_list = []
strlist = self.raw.split(" ")
for x in strlist:
if x not in stopwords and x not in custom_filter:
keywords_list.append(self.remove_unwanted_chars(x))
return keywords_list
def process_response(self):
percentage = lambda x, y: (100 * y) / x
total = sum(len(x['keywords']) for x in self.responses)
most_acc = 0
response_data = None
acc = 0
for value in self.responses:
c = 0
for x in value['keywords']:
if str(x).lower() in self.processed_str:
c += 1
if c > most_acc:
most_acc = c
acc = percentage(total, most_acc)
print(acc)
response_data = value
if acc < 6:
return {"response": "Sorry, I do not understand. Be more clear please"}
for x in self.processed_str:
if x not in response_data['keywords']:
response_data['keywords'].append(x)
return response_data
if __name__ == '__main__':
while True:
k = input("Você: ")
res = ChatterMessage(k)
.response
print("Bot:", res)
How to remove accents from keyword strings to "make it easier" for chatbot to read? I found this explanation: How to remove string accents using Python 3? But I don't know how it would be applied to this code as the bot always stops responding
You could use the Python package unidecode that replaces special characters with ASCII equivalents.
from unidecode import unidecode
text = "Björn, Łukasz and Σωκράτης."
print(unidecode(text))
# ==> Bjorn, Lukasz and Sokrates.
You could apply this to both the input and keywords.
# In the function definition of reduce(), place this line of code after
# stopwords = ['de', 'a', 'o', .....])
stopwords = [unidecode(s) for s in stopwords]
# In "__main__": replace k = input("Você: ") with the following line of code.
k = unidecode(input("Você: "))
If it makes sense, you could also force the strings to be all lowercase. This will make your string comparisons even more robust.
k = unidecode(input("Você: ").lower())
Because you requested the entire code:
import json
from random import choice
from unidecode import unidecode
class ChatterMessage:
def __init__(self, raw):
self.raw = str(raw).lower()
self.processed_str = self.reduce()
self.responses = self.get_responses()
self.data = self.process_response()
self.response = choice(self.data['response'])
def remove_unwanted_chars(self, string):
list_of_chars = ['?', ".", ",", "!", "#", "[", "]", "{", "}", "#", "$", "%", "*", "&", "(", ")", "-", "_", "+", "="]
new_str = ""
for char in string:
if char not in list_of_chars:
new_str += str(char)
return new_str
def get_responses(self, response_file="info.json"):
with open(response_file, 'r') as file:
return json.loads(file.read())
def reduce(self):
stopwords = ['de', 'a', 'o', 'que', 'e', 'é', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à', 'seu', 'sua', 'ou', 'quando', 'muito', 'nos', 'já', 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'depois', 'sem', 'mesmo', 'aos', 'seus', 'quem', 'nas', 'me', 'esse', 'eles', 'você', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'numa', 'pelos', 'elas', 'qual', 'nós', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'dele', 'tu', 'te', 'vocês', 'vos', 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas', 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos', 'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos', 'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver', 'estivermos', 'estiverem', 'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 'houvera', 'houvéramos', 'haja', 'hajamos', 'hajam', 'houvesse', 'houvéssemos', 'houvessem', 'houver', 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos', 'houverão', 'houveria', 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'foi', 'fomos', 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for', 'formos', 'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém', 'tinha', 'tínhamos', 'tinham', 'tive', 'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha', 'tenhamos', 'tenham', 'tivesse', 'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', 'teremos', 'terão', 'teria', 'teríamos', 'teriam']
stopwords = [unidecode(s) for s in stopwords]
custom_filter = []
keywords_list = []
strlist = self.raw.split(" ")
for x in strlist:
if x not in stopwords and x not in custom_filter:
keywords_list.append(self.remove_unwanted_chars(x))
return keywords_list
def process_response(self):
percentage = lambda x, y: (100 * y) / x
total = sum(len(x['keywords']) for x in self.responses)
most_acc = 0
response_data = None
acc = 0
for value in self.responses:
c = 0
for x in value['keywords']:
if str(x).lower() in self.processed_str:
c += 1
if c > most_acc:
most_acc = c
acc = percentage(total, most_acc)
print(acc)
response_data = value
if acc < 6:
return {"response": "Sorry, I do not understand. Be more clear please"}
for x in self.processed_str:
if x not in response_data['keywords']:
response_data['keywords'].append(x)
return response_data
if __name__ == '__main__':
while True:
k = unidecode(input("Você: "))
res = ChatterMessage(k).response
print("Bot:", res)

How can I compare 2 indexes in same list in Python?

code_length = len(morse_list)
morse_length = len(morse_code)
for i in range(code_length):
for j in range(morse_length):
if morse_list[i] == morse_code[j][1]:
translated += morse_code[j][0]
if morse_list[i] == '':
translated += ' '
elif morse_list[i] and morse_list[i+1] == '': <<<<<<<<<<<<
translated += '\n'
IndexError: list index out of range.
What I am trying to do is, if there are two enters in-a-row on user input, I want to put enter(\n) which is after finish the N sentence. I am sure there is the better way but I can't figure it out. Any suggestions, please?
Edit for example, if in the list
['.-', '-...', '-.-.', '-..', '', '', '.', '..-.', '', '', '--.'],
output: abcd\n ef\n g
Edit2,
morse_code = (
('a', '.-'), ('b', '-...'), ('c', '-.-.'), ('d', '-..'),
('e', '.'), ('f', '..-.'), ('g', '--.'), ('h', '....'), ('i', '..'),
('j', '.---'), ('k', '-.-'), ('l', '.-..'), ('m', '--'), ('n', '-.'),
('o', '---'), ('p', '.--.'), ('q', '--.-'), ('r', '.-.'),
('s', '...'), ('t', '-'), ('u', '..-'), ('v', '...-'), ('w', '.--'),
('x', '-..-'), ('y', '-.--'), ('z', '--..')
)
morse_list = [] # user input words
Your morse_list [i + 1] doesn't exist if i == code_length - 1,
since you then try to retrieve morse_list [code_length].
The indexes only run from 0 to code_length - 1.
I'm not satisfied with this, but it works and will get the output, some one about to school me haha
morse_code = (
('a', '.-'), ('b', '-...'), ('c', '-.-.'), ('d', '-..'),
('e', '.'), ('f', '..-.'), ('g', '--.'), ('h', '....'), ('i', '..'),
('j', '.---'), ('k', '-.-'), ('l', '.-..'), ('m', '--'), ('n', '-.'),
('o', '---'), ('p', '.--.'), ('q', '--.-'), ('r', '.-.'),
('s', '...'), ('t', '-'), ('u', '..-'), ('v', '...-'), ('w', '.--'),
('x', '-..-'), ('y', '-.--'), ('z', '--..')
)
test = ['.-', '-...', '-.-.', '-..', '', '', '.', '..-.', '', '', '--.']
output = []
for index, item in enumerate(test):
for a in morse_code:
if item == a[1]:
output.append(a[0])
elif item == '' and test[index +1] == '':
output.append('\n')
break
else:
pass
print(''.join(output))
(xenial)vash#localhost:~/python/stack_overflow$ python3.7 morse.py
abcd
ef
g
I would put morse code into dict.
morse = {'.-': 'a',
'-...': 'b',
'-.-.': 'c',
'-..': 'd',
'': ' '}
morse_list = ['.-', '-...', '-.-.', '-..', '', '', '.-', '-...']
message = ''
for item in morse_list:
message += morse[item]
message = message.replace(' ', '\\n ')
print(message)
output: abcd\n ab
I see. Here is a list version. The 'if' can be removed from the loop if We could put this (' ','') into 'morse_code'.
morse_code = (
('a', '.-'), ('b', '-...'), ('c', '-.-.'), ('d', '-..'),
('e', '.'), ('f', '..-.'), ('g', '--.'), ('h', '....'), ('i', '..'),
('j', '.---'), ('k', '-.-'), ('l', '.-..'), ('m', '--'), ('n', '-.'),
('o', '---'), ('p', '.--.'), ('q', '--.-'), ('r', '.-.'),
('s', '...'), ('t', '-'), ('u', '..-'), ('v', '...-'), ('w', '.--'),
('x', '-..-'), ('y', '-.--'), ('z', '--..')
)
morse_list = ['.-', '', '-...', '-.-.', '-..', '', '', '.-', '-...']
message = ''
for item in morse_list:
if not item:
message += ' '
continue
message += next(filter(lambda m: m[1] == item, morse_code))[0]
message = message.replace(' ', '\n')
print(message)

Scrape several instances of a webpage the fastest way possible

So, after a lot of attempts, search and research I give up.
I have a webpage where all employees name, phone, mail and userid can be query. The way you do that is that the request to the server needs to have at least 4 digits, with all 26 ascll character + 0-9 numbers. I was able to do it with Selenium in Python...but it whould take 20 days to go through - see code.
from selenium import webdriver
import csv
alphanum = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'z', '1',
'2', '3', '4', '5', '6', '7', '8', '9', '0']
driver = webdriver.Firefox()
driver.get('http://brnet.intra.corpintra.net/Quem/pessoas2/Default.asp')
list_base = dict()
i = 0
data_str = []
found = False
for first_chr in alphanum:
for second_chr in alphanum:
for third_chr in alphanum:
text = first_chr + second_chr + third_chr
element_name = driver.find_element_by_name('nome').clear()
element_name = driver.find_element_by_name('nome')
element_name.send_keys(text)
element_search = driver.find_element_by_name('B1')
element_search.click()
if driver.find_elements_by_class_name('dados'):
for table_data in driver.find_elements_by_class_name('dados'):
cells_table = table_data.find_elements_by_tag_name('td')
for cell_data in cells_table:
data_str.append(cell_data.text.strip())
if list_base:
for key, value in list_base.items():
for data in data_str:
if data in value:
found = False
else:
found = True
else:
found = False
if found is False:
list_base[i] = data_str
i = i+1
data_str = []
found = False
driver.back()
w = csv.writer(open("output.csv", "w"))
for key, value in list_base.items():
w.writerow([key, value])
driver.quit()
Is there a way to reduce the time?

How to parse lines in file in sequence/match adequate results using python, pyparsing?

Here's my code:
from pyparsing import *
survey ='''
BREAK_L,PN1000,LA55.16469813,LN18.15054629
PN1,LA54.16469813,LN17.15054629,EL22.222
BREAK_L,PN2000,LA55.16507249,LN18.15125566
PN6,LA54.16506873,LN17.15115798,EL33.333
PN7,LA54.16507249,LN17.15125566,EL44.444
BREAK_L,PN3000,LA55.16507249,LN18.15125566
PN10,LA54.16507522,LN17.15198405,EL55.555
PN11,LA54.16506566,LN17.15139220,EL44.44
PN12,LA54.16517275,LN17.15100652,EL11.111
'''
digits = "0123456789"
number = Word(nums+'.').setParseAction(lambda t: float(t[0]))
num = Word(digits)
text = Word(alphas)
pt_id = Suppress('PN') + Combine(Optional(text) + num + Optional(text) + Optional(num))
separator = Suppress(',')
latitude = Suppress('LA') + number
longitude = Suppress('LN') + number
gps_line = pt_id + separator + latitude + separator + longitude
break_line = (Suppress('BREAK_L,')
+ pt_id
+ separator
+ latitude
+ separator
+ longitude)
result1 = gps_line.scanString(survey)
result2 = break_line.scanString(survey)
for item in result1:
print item
With example above I would like to find solution how to get output like:
gps_line + it's break_line, what means something like in pseudo code:
for every gps_line in result1:
print gps_line + precedent break_line
If matter of my question is not clear or not fit to description, feel free to change it.
EDIT #2
What I try to achieve is output:
['1', 54.16469813, 17.15054629, 22.222, 'BP1000', 55.16469813, 18.15054629]
['6', 54.16506873, 17.15115798, 33.333, 'BP2000', 55.16507249, 18.15125566]
['7', 54.16507249, 17.15125566, 44.444, 'BP2000', 55.16507249, 18.15125566]
['10', 54.16507522, 17.15198405, 55.555, 'BP3000', 55.16507249, 18.15125566]
['11', 54.16506566, 17.1513922, 44.44, 'BP3000', 55.16507249, 18.15125566]
['12', 54.16517275, 17.15100652, 11.111, 'BP3000', 55.16507249, 18.15125566]
Second attempt:
from decimal import Decimal
from operator import itemgetter
survey ='''
BREAK_L,PN1000,LA55.16469813,LN18.15054629
PN1,LA54.16469813,LN17.15054629,EL22.222
BREAK_L,PN2000,LA55.16507249,LN18.15125566
PN6,LA54.16506873,LN17.15115798,EL33.333
PN7,LA54.16507249,LN17.15125566,EL44.444
BREAK_L,PN3000,LA55.16507249,LN18.15125566
PN10,LA54.16507522,LN17.15198405,EL55.555
PN11,LA54.16506566,LN17.15139220,EL44.44
PN12,LA54.16517275,LN17.15100652,EL11.111
'''
def parse_line(line):
brk = False
kv = {}
for part in line.split(','):
if part == 'BREAK_L':
brk = True
else:
k = part[:2]
v = part[2:]
kv[k] = v
return (brk,kv)
def parse_survey(survey):
ig1 = itemgetter('PN','LA','LN','EL')
ig2 = itemgetter('PN','LA','LN')
brk_data = None
for line in survey.strip().splitlines():
brk, data = parse_line(line)
if brk:
brk_data = data
continue
else:
yield ig1(data) + ig2(brk_data)
for r in parse_survey(survey):
print r
Yields:
('1', '54.16469813', '17.15054629', '22.222', '1000', '55.16469813', '18.15054629')
('6', '54.16506873', '17.15115798', '33.333', '2000', '55.16507249', '18.15125566')
('7', '54.16507249', '17.15125566', '44.444', '2000', '55.16507249', '18.15125566')
('10', '54.16507522', '17.15198405', '55.555', '3000', '55.16507249', '18.15125566')
('11', '54.16506566', '17.15139220', '44.44', '3000', '55.16507249', '18.15125566')
('12', '54.16517275', '17.15100652', '11.111', '3000', '55.16507249', '18.15125566')
This is really not much different to my previous attempt. I'd already paired the data for you. I assume you'll be able to change 1000 into BP1000 yourself.

Categories