Scrape several instances of a webpage the fastest way possible - python

So, after a lot of attempts, search and research I give up.
I have a webpage where every employee's name, phone, mail and user ID can be queried. Each request to the server needs to contain at least 4 characters, drawn from the 26 ASCII letters plus the digits 0-9. I was able to do it with Selenium in Python... but it would take 20 days to go through - see code.
from selenium import webdriver
import csv
# Characters used to build the three-character search prefixes: the 26
# lowercase ASCII letters followed by the digits in the original order.
# (The hand-typed list this replaces skipped 'w'.)
alphanum = list('abcdefghijklmnopqrstuvwxyz1234567890')

# NOTE(review): the find_element(s)_by_* helpers are kept as posted; newer
# Selenium releases replace them with find_element(By.NAME, ...) - confirm
# against the installed version.
driver = webdriver.Firefox()
driver.get('http://brnet.intra.corpintra.net/Quem/pessoas2/Default.asp')

# Collected rows, keyed by a running index in the order they were first seen.
list_base = dict()
# Rows already stored, so overlapping searches do not add the same row twice.
seen_rows = set()
i = 0

try:
    for first_chr in alphanum:
        for second_chr in alphanum:
            for third_chr in alphanum:
                text = first_chr + second_chr + third_chr

                # Fill in the search box and submit the form.
                element_name = driver.find_element_by_name('nome')
                element_name.clear()
                element_name.send_keys(text)
                driver.find_element_by_name('B1').click()

                # One lookup of the result tables per search; an empty
                # result simply skips the loop.
                for table_data in driver.find_elements_by_class_name('dados'):
                    data_str = [
                        cell_data.text.strip()
                        for cell_data in table_data.find_elements_by_tag_name('td')
                    ]
                    row_key = tuple(data_str)
                    if row_key not in seen_rows:
                        seen_rows.add(row_key)
                        list_base[i] = data_str
                        i = i + 1

                # Return to the search form for the next prefix.
                driver.back()
finally:
    # Always release the browser, even if a lookup fails part-way through.
    driver.quit()

# newline="" is what the csv module expects for files it writes to.
with open("output.csv", "w", newline="") as output_file:
    w = csv.writer(output_file)
    for key, value in list_base.items():
        w.writerow([key, value])
Is there a way to reduce the time?

Related

Why am I getting a garbage tokenizer?

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers
# Build a Unigram-model tokenizer with NFKC normalisation and byte-level
# pre-tokenisation and decoding.
tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
# Trainer settings: a target vocabulary of 30000 entries, seeded with the
# byte-level alphabet, plus seven reserved special tokens.
trainer = trainers.UnigramTrainer(
vocab_size=30000,
initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
special_tokens=["<PAD>", "<BOS>", "<EOS>", '<s>', '</s>', '<unk>', '<mask>'],
# NOTE(review): confirm that UnigramTrainer actually honours min_frequency;
# it may only apply to the other trainer types.
min_frequency = 2
)
def batch_iterator(batch_size=10, size=5000):
    """Yield lists of note texts, page by page, for tokenizer training.

    Reads 100 pages from ``db.note`` and yields each page in consecutive
    slices of at most ``batch_size`` texts.

    Args:
        batch_size: Maximum number of note texts per yielded list.
        size: Number of rows requested per page; page ``i`` selects rows
            whose ``id`` is greater than ``i * size``.

    Yields:
        Non-empty lists of ``note_text`` values.
    """
    for i in range(100):
        # Request a full page in a stable order.  The earlier ``limit 50``
        # returned at most 50 rows while the inner loop stepped through
        # ``size`` positions, so nearly every yielded batch was empty.
        query = (
            f"select note_text from db.note where id > {i * size} "
            f"order by id limit {size};"
        )
        df = pd.read_sql(sql=query, con=db)
        # Materialise the column once per page rather than once per batch.
        notes = df['note_text'].tolist()
        # Step over the rows actually returned, so a short or empty page
        # never produces empty batches.
        for x in range(0, len(notes), batch_size):
            yield notes[x:x + batch_size]
# Train on the batches produced by batch_iterator().  `length` is given as
# 100 * 5000, i.e. the iterator's 100 pages times its default `size`.
# NOTE(review): presumably the total number of sequences, used only for
# progress reporting - confirm against the tokenizers documentation.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=100*5000)
A single note may look something like this:
!~!~!~!~!~!~!~!~!~!~!~!~!~!~Discussing settlement with Amy.!~!~
The output looks as follows:
# Encode a sample sentence with the trained tokenizer.
# NOTE(review): the token list shown underneath is presumably `out.tokens` - confirm.
out = tokenizer.encode('There should be an inspection come Monday 1/2/2022!')
['ĠThe', 'r', 'e', 'Ġsh', 'ould', 'Ġbe', 'Ġan', 'Ġinspect', 'ion', 'Ġ', 'com', 'e', 'Ġ', 'M', 'ond', 'a', 'y', 'Ġ', '1', '/', '2', '/', '20', '2', '2', '!']

Python only encryption/obfuscation

I'm looking for simple password-based obfuscation/security of strings.
I've pretty much gone over each example of > Simple way to encode a string according to a password?
And none of them work with my python 3.7.
I got the error with ord(), so I updated the code, but even after that it's still broken. For example:
from itertools import cycle
def encode_zip_cycle(key, clear):
    """Obfuscate ``clear`` with a repeating-key shift and base64-encode it.

    Each character's code point is shifted by the code point of the matching
    key character (the key repeats as needed), modulo 256.  The shifted
    characters are joined, UTF-8 encoded and URL-safe base64 encoded.

    Args:
        key: Password string, cycled over the length of ``clear``.
        clear: Text to obfuscate.

    Returns:
        The URL-safe base64 encoding, as ``bytes``.
    """
    # Imported locally because the snippet only imports ``cycle`` up front.
    import base64

    shifted = "".join(
        chr((ord(clear_char) + ord(key_char)) % 256)
        for clear_char, key_char in zip(clear, cycle(key))
    )
    # str.encode() defaults to UTF-8, so shifted values >= 128 occupy two
    # bytes each; decoding must reverse this step with bytes.decode().
    return base64.urlsafe_b64encode(shifted.encode())
# Decoder exactly as posted in the question; it does not round-trip.
def decode_zip_cycle(key, enc):
# b64decode returns bytes, so iterating `enc` below yields the integer byte
# values of the encoder's UTF-8 output rather than the shifted characters.
enc = base64.urlsafe_b64decode(enc)
dec = [chr((256 + enc_char - ord(key_char)) % 256)
for enc_char, key_char in zip(enc, cycle(key))]
# Debug output of the recovered characters.
print(dec)
return "".join(dec)
# Demo run: obfuscate a sample string with a password, then try to recover it.
text = "ATTACKATONCEfor Live 2154125-21-512^!££613-123!"
s = "1235348udgfjff"

print(f"Text : {text}")
print(f"Shift : {s}")

# Encoding is deterministic, so the cipher is computed once and reused.
cipher = encode_zip_cycle(s, text)
print("Cipher: ", cipher)
print("Original text: ", decode_zip_cycle(s, cipher))
Gives me
Text : ATTACKATONCEfor Live 2154125-21-512^!££613-123!
Shift : 1235348udgfjff
Cipher: b'csKGwod2dn95w4nCs8K1wqnCr8OMw5XCo1J_wp7CqcKZWMKVwoTCmcKXwp_CmsKXY2dgZ2RhbcKmwpbDhcKHDQnCnGJlYGZlZ1k='
['A', '\x90', 'S', '\x8d', 'T', 'B', '>', '\n', '\x15', '\\', '#', 'X', 'M', '\\', '\x84', '\x90', 'v', '\x8d', '|', '\x8f', 'T', 'N', '1', '[', '=', 'è', '\x19', '\\', 'm', '\x90', 'v', '\x8d', 'f', '$', '\x8a', ' ', '^', '\x1d', '\\', '/', '\\', '1', '\x91', 'm', '\x8f', 'e', '\x8f', 'c', '+', 'ò', 'ü', '\x00', 'þ', '÷', '\x07', '\\', 'u', '\x90', 'c', '\x8e', 'R', '\x8e', 'O', '\x98', '¥', '[', '6', 'ø', 'ÿ', 'ú', '5', '3', '4', '$']
Original text: ASTB>
\#XM\v|TN1[=è\mvf$ ^\/\1mec+òü þ÷\ucRO¥[6øÿú534$
In encode_zip_cycle you encode the "encrypted" string into utf-8 before doing the second encoding into base64. Yet, you don't revert this operation later in decode_zip_cycle.
This is the correct decode_zip_cycle function:
def decode_zip_cycle(key, enc):
    """Reverse ``encode_zip_cycle`` and return the original text.

    The payload is base64-decoded and then UTF-8 decoded back into the
    shifted characters, after which each character is shifted back by the
    code point of the matching (repeating) key character, modulo 256.

    Args:
        key: Password string used for encoding.
        enc: URL-safe base64 payload produced by ``encode_zip_cycle``.

    Returns:
        The recovered text as ``str``.
    """
    # Imported locally because the snippet only imports ``cycle`` up front.
    import base64

    # Undo both encodings: base64 first, then UTF-8, to get characters back.
    shifted = base64.urlsafe_b64decode(enc).decode()
    dec = [
        chr((256 + ord(enc_char) - ord(key_char)) % 256)
        for enc_char, key_char in zip(shifted, cycle(key))
    ]
    # Debug output kept from the posted version.
    print(dec)
    return "".join(dec)

How to manually select elements from a list encoded in utf-8?

I have a list which looks like this:
['क', ',', 'म', '-', 'ह', 'औ', "'", ')', '(', 'स', '.', 'ए', 'प', 'श',
'भ', 'ल', 'य', 'न', 'इ', '}', 'ज', 'र', 'उ', 'ग', 'द', 'त', 't', 'थ',
'ब', 'अ', 'ई', 'o', '%', 'व', 'a', 'आ', '#', '–', 'q', 'i', '।', '/',
'ओ', 'फ', 'f', 's', 'u', '!', '?', 'ध', 'ऐ', '१', '+', '२', 'p', 'd',
'j', 'च', 'ऑ', 'b', 'छ', 'ऊ', 'l', 'e', 'w', 'ख', 'घ', 'c', 'r', 'y',
'g', 'n', 'ट', 'ड', 'x', '५', '"', '३', 'm', 'ठ', 'h', '४', '•', '$',
'>', 'v', 'z', 'झ', '७', '—', '६', 'k', 'ढ', '८', '&', 'ऋ', '\', '९',
'✉', '०', '॥', '°', '^', '~', '−', '·', 'ॐ', '×', '_', '→', '☆', '£',
'€', 'α', '‘', 'ष', '±', '†', 'β', '#', '\u200e', '░', '¬', '₹', 'π',
'½', '…', 'ऍ', 'º', 'σ', 'γ', 'δ', 'ऽ', '0', '²', 'ङ', 'ॠ', 'à', '≥',
'ः', 'ऎ', 'ω', 'μ', '{', 'ण', 'ं', '≈', 'ε', 'λ', 'θ', '्', '<', '↑',
'\uf0a7', 'φ', '\u200b', '📝', 'ञ', 'о', 'ƒ', '©', '←', 'ळ', 'ा', '■',
'¢', 'ρ', '∞', 'î', '⁄', '√', 'ব', '§', '¾', '≤', '॰', '্', 'é', 'و',
'`', '¥', '♂', '₩', 'å', '´', 'ü', 'á', 'ó', 'ভ', 'в', '¼', '़', 'è',
'ʁ', 'े', 'и', '≡', 'ζ', 'í', '↙', '″', '\u200d', '₫', 'م', '»', 'ː',
'‡', 'ö', 'ँ', 'د', 'η', 'ð', '♦', 'শ', 'প', 'ी', 'ú', '⅓', 'ب', '≠',
'κ', '∈', 'ç', '�', 'এ', 'উ', 'র', 'ν', 'â', 'ê', 'ū', 'к', 'ø', 'ù',
'ā', 'ä', 'æ', '↓', 'ô', 'স', 'ो', 'ō', '●', 'ē', '₨', 'ि', '„', 'ī',
'฿', 'ò', 'τ', 'ਸ', 'ऩ', 'ª', 'р', 'ত', 'н', 'с', 'χ', 'ಕ', 'ë', 'ॉ',
'خ', 'ا', 'ψ', 'а', '¿', 'ì', 'ý', 'µ', 'ौ', 'š', '‰', '÷', 'ি', 'আ',
'ə', 'у', '★', 'ل', 'ॡ', 'č', '⊕', 'ृ', 'ñ', 'ै', 'û', 'ु', 'ू', 'м',
'þ', 'г', 'ι', 'മ', 'پ', '☉', 'த', 'ш', '¤', 'е', '📥', 'ş', 'ã', 'ž',
'খ', 'ع', '水', 'õ', 'ক', 'д', 'ï', 'ج', 'ধ', 'ж', 'ऒ', 'ଓ', '¹', 'ॅ',
'ħ', 'ন', 'ও', 'ʃ', '\u200c', 'ز', 'х', 'מ', '⊂', 'ф', 'য', 'എ',
'\u202a', 'ষ', '九', '♣', 'ł', 'ऌ', 'া', 'త', 'з', 'ß', 'ف', 'ר', '―',
'п', 'غ', 'ऱ', 'ر', 'ŋ', 'ϕ', 'ऴ', 'ክ', 'ğ', 'ą', 'ś', 'ę', '¨', 'ч',
'ż', '№', 'س', 'œ', 'ă', '♠', 'ش', '◾', '\uf0fc', 'ÿ', 'ש', '\ufeff',
'ಜ', 'ن', 'ʊ', '³', 'ć', 'آ', 'চ', 'ɛ', 'я', '이', 'đ', 'জ', 'ġ', '⅜',
'ɑ', '˚', 'ξ', 'л', 'б', 'т', 'ц', '∟', 'й', 'ಭ', 'സ', 'ɒ', 'అ', 'న',
'⍨', '⌛', '⌣', 'ĝ', 'ő', 'ĉ', '،', 'ċ', 'ĵ', 'ژ', 'হ', 'ŝ', 'ণ', '沖',
'⅞', 'ت', '١', 'ɸ', 'ɪ', 'ʌ', 'ě', 'ע', '¸', 'দ', 'ŭ', '∧', 'أ', 'અ',
'ɨ', 'ĥ', '∀', 'ગ', 'ű', 'ʒ', 'ح', 'ث', '█', '∩', 'ق', '↔', '®', 'ਪ',
'⇒', '⅔', '∑', '⇔', 'ழ', 'ю', '月', 'ذ', 'ǻ', 'ń', '∪', 'ك', 'ʰ', 'ё',
'э', '™', 'ض', 'ਦ', 'ɹ', '☞', 'ঞ', 'ম', 'ু', '②', '道', 'ӏ', '大', 'щ',
'א', 'ʔ', 'ǁ', 'ה', '∂', 'ţ', 'പ', '∨', '성', 'ỳ', 'ബ', '∇', 'ظ', 'ط',
'ẽ', 'ص', 'ಶ', 'υ', 'ഞ', 'қ', 'ỹ', 'ź']
I need to select only the Devanagari characters (i.e. 'अ', 'आ'...'क','ख','घ'..) out of above characters. Is there any other way to do this in python except that of manually selecting the indices?
You can iterate through each item in the list, convert each character to a unicode number using ord, then test whether it lies within the Devanagari unicode character range (see: https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)). If it does, add it to the output list.
It will end up something like this:
# Keep only characters in the Devanagari Unicode block, U+0900 (2304)
# through U+097F (2431) inclusive.  `list` is the character list from the
# question; note that this name shadows the builtin.
newlist = [c for c in list if 0x0900 <= ord(c) <= 0x097F]

AWS-BOTO security group error

I have the following code to spin up new instances:
# Connect to EC2 in us-east-1 with the given access key, secret key and
# security token (placeholders shown as "xx").
conn = boto.ec2.connect_to_region("us-east-1", security_token = "xx", aws_access_key_id= "xx", aws_secret_access_key= "xx")
security_groups = conn.get_all_security_groups()
for security_group in security_groups:
# The first 14 characters of the object's string form are dropped before
# comparing.
# NOTE(review): presumably this strips a "SecurityGroup:" prefix to leave
# the group name - confirm.
if str(security_group)[14:] == "xx":
conn.run_instances(
'ami-da2cd5b2',
key_name='fornax_keypair',
instance_type='c1.xlarge',
# The group object itself is passed here, not a list; this is the call that
# produces the "'SecurityGroup' object is not iterable" error quoted below.
security_groups=security_group)
else:
continue
It finds the security group and then gives the error :
TypeError: 'SecurityGroup' object is not iterable
If I change it to str(security_group), it then gives the error:
<Response><Errors><Error><Code>InvalidGroup.NotFound</Code><Message>The security groups 'f', 'g', 'd', 'e', 'c', 'n', 'o', 'j', '.', 'i', 'v', 'u', 't', 's', 'r', 'p', '
:', 'y' do not exist</Message></Error></Errors><RequestID>c96afd3c-de3f-4441-be65-c6a85fbe7868</RequestID></Response>
Also how do I attach the connection to an already established vpc connection and subnet ?
The security_groups parameter to run_instances is supposed to be a list of security group names. You are passing a scalar string value. Try this instead:
# security_groups takes a list, so the single group is wrapped in one.
# NOTE(review): if `security_group` is a SecurityGroup object rather than
# its name, pass [security_group.name] instead - confirm against the boto docs.
conn.run_instances(
    'ami-da2cd5b2',
    key_name='fornax_keypair',
    instance_type='c1.xlarge',
    security_groups=[security_group])

Is the cc recipients in a received email a Python list? (Google App Engine)

I am trying to pull the cc'ed email addresses from received email. I am working in the development server.
The tutorial says that "cc contains a list of the cc recipients." But it seems that message.cc returns a string. I am just using the code I copied from the cookbook:
class ReceiveEmail(InboundMailHandler):
    """Inbound mail handler that logs the sender, text bodies and Cc field."""

    def receive(self, message):
        """Log the details of a received message.

        Args:
            message: The inbound message; its ``sender`` and ``cc``
                attributes and its ``bodies()`` method are read.
        """
        logging.info("Received email from %s", message.sender)

        plaintext = message.bodies(content_type='text/plain')
        for text in plaintext:
            # Element [1] of each item is the encoded payload; decode it
            # before logging.
            txtmsg = text[1].decode()
            logging.info("Body is %s", txtmsg)

        # Logged once per message, after the bodies.
        logging.info("CC email is %s", message.cc)
So if I have 1 cc, the log shows:
CC email is cc12#example.com
If there are more than 1:
CC email is cc12#example.com, cc13#example.com
To get the first email "cc12#example.com", I tried:
# Indexing message.cc directly; on a str this yields only its first character.
logging.info("CC email is %s" % message.cc[0])
but this gives:
CC email is c
so the result is treated as a string.
When I try
# list() over a str splits it into its individual characters.
logging.info("CC email is %s" % list(message.cc))
I get
['c', 'c', '1', '2', '#', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ',', ' ', 'c', 'c', '1', '3', '#', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ',', ' ', 'c', 'c', '1', '4', '#', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm'
Again, it appears that message.cc returns string not list.
Do I need to use regex to get the emails? Any suggestions about what I am doing wrong? Thanks!
Try:
# Split the comma-separated Cc string and trim the space after each comma,
# so every entry is a bare address.
cc_list = [address.strip() for address in message.cc.split(',')]
cc
A recipient's email address (a string) or a list of email addresses to appear on the Cc: line in the message header.
Message Fields
cc is a string
message.cc.split(", ")[0] gives "cc12#example.com", which is the value you want.

Categories