from linkedin_scraper import Person
from selenium.common.exceptions import NoSuchElementException

for line in f:  # each line is a profile URL
    try:
        person = Person(line, driver=browser, scrape=True, close_on_complete=False)
        print(person.name)
        print(person.company)
        print(person.job_title)
        print(person.educations)
        print(person.experiences)
        print(person.interests)
        print(person.location)
        print(person.linkedin_url)
        print(person.also_viewed_urls)
    except NoSuchElementException as e:
        print("ERROR MESSAGE FOR DEVELOPER: ", e)
Output:
Reham John
None
tianshi international
[None at b'sirsyed college of commerece' from None to None, b'maric from govr high school and bcom from sir syed college' at b'govt high school, sir syed college jaranwala' from 2006 to 2007]
[b'tianshi international' at None from None to None for None based at None]
[]
Germany
https://www.linkedin.com/in/reham-john-498671ba/
[]

Amjad John
None
tianshi international
[None at b'sirsyed college of commerece' from None to None, b'maric from govr high school and bcom from sir syed college' at b'govt high school, sir syed college jaranwala' from 2006 to 2007]
[b'tianshi international' at None from None to None for None based at None, b'CEO' at None from None to None for None based at None, b'Chief Executive Officer' at b'John Group of companies' from 2000 – to for 20 yrs based at Islamabad]
[]
Federal Capial &AJK, Pakistan
https://www.linkedin.com/in/amjad-john-7033bb17a/
[]
These are two different profiles at two different companies, but the output shows the previous person's experience (and education) for the second profile.
You can solve this by providing the following arguments to the Person function:
Person(
    linkedin_url=None,
    name=None,
    about=[],
    experiences=[],
    educations=[],
    interests=[],
    accomplishments=[],
    company=None,
    job_title=None,
    driver=None,
    scrape=True
)
This will ensure that you get different values for each iteration/Person.
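The reason this happens is most likely Python's mutable default arguments: defaults like experiences=[] are created once when the function is defined and then shared by every call that doesn't override them, so each scrape keeps appending to the same lists. Passing fresh empty lists on every iteration, as the signature above allows, avoids the leak. A minimal sketch under that assumption:

from linkedin_scraper import Person
from selenium.common.exceptions import NoSuchElementException

for line in f:
    try:
        # fresh, empty lists each time so the previous profile's
        # results cannot leak into this one
        person = Person(
            linkedin_url=line,
            experiences=[],
            educations=[],
            interests=[],
            accomplishments=[],
            driver=browser,
            scrape=True,
        )
        print(person.name, person.experiences)
    except NoSuchElementException as e:
        print("ERROR MESSAGE FOR DEVELOPER: ", e)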
So I start out with a file that lists title, actor, title, actor, etc.
12 Years a Slave
Topsy Chapman
12 Years a Slave
Devin Maurice Evans
12 Years a Slave
Brad Pitt
12 Years a Slave
Jay Huguley
12 Years a Slave
Devyn A. Tyler
12 Years a Slave
Willo Jean-Baptiste
American Hustle
Christian Bale
American Hustle
Bradley Cooper
American Hustle
Amy Adams
American Hustle
Jeremy Renner
American Hustle
Jennifer Lawrence
I need to make a dictionary that looks like what's below and lists all actors in the movie
{'Movie Title': ['All actors'], 'Movie Title': ['All Actors']}
So far I only have this
d = {}
with open(file, 'r') as f:
    for key in f:
        d[key.strip()] = next(f).split()
print(d)
Using a defaultdict is usually a better choice:
from collections import defaultdict

data = defaultdict(list)
with open("filename.txt", 'r') as f:
    stripped = map(str.strip, f)  # lazily strip every line
    # both zip arguments are the same iterator, so each turn of the
    # loop consumes two consecutive lines: a movie and an actor
    for movie, actor in zip(stripped, stripped):
        data[movie].append(actor)
print(data)
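Why zip(stripped, stripped) pairs consecutive lines: both arguments are the same iterator object, so each loop turn zip advances it twice, once for the movie and once for the actor. A tiny demonstration:

it = iter(['12 Years a Slave', 'Topsy Chapman',
           'American Hustle', 'Christian Bale'])
print(list(zip(it, it)))
# [('12 Years a Slave', 'Topsy Chapman'), ('American Hustle', 'Christian Bale')]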
So you need to switch between reading a title line and reading an actor line, and you need to remember the title so you can use it when the actor line arrives. Whether the title is currently set can itself drive that switching; add some key checking and you have working logic.
# pretty printer to make the output nice
from pprint import pprint

data = """12 Years a Slave
Topsy Chapman
12 Years a Slave
Devin Maurice Evans
12 Years a Slave
Brad Pitt
12 Years a Slave
Jay Huguley
12 Years a Slave
Devyn A. Tyler
12 Years a Slave
Willo Jean-Baptiste
American Hustle
Christian Bale
American Hustle
Bradley Cooper
American Hustle
Amy Adams
American Hustle
Jeremy Renner
American Hustle
Jennifer Lawrence"""

result = {}
title = None
for line in data.splitlines():
    # clean here once
    line = line.strip()
    if not title:
        # store the title
        title = line
    else:
        # check if the title already exists
        if title in result:
            # if yes, append the actor
            result[title].append(line)
        else:
            # if no, create it with a new list for actors
            # and of course, add the current line as actor
            result[title] = [line]
        # reset title to None
        title = None
pprint(result)
Output:
{'12 Years a Slave': ['Topsy Chapman',
'Devin Maurice Evans',
'Brad Pitt',
'Jay Huguley',
'Devyn A. Tyler',
'Willo Jean-Baptiste'],
'American Hustle': ['Christian Bale',
'Bradley Cooper',
'Amy Adams',
'Jeremy Renner',
'Jennifer Lawrence']}
EDIT
When reading from a file, you need to do it slightly differently.
from pprint import pprint

result = {}
title = None
with open("somefile.txt") as infile:
    for line in infile.read().splitlines():
        line = line.strip()
        if not title:
            title = line
        else:
            if title in result:
                result[title].append(line)
            else:
                result[title] = [line]
            title = None
pprint(result)
I am trying to scrape data from:
https://www.canadapharmacy.com/
Below are a few pages that I need to scrape:
https://www.canadapharmacy.com/products/abilify-tablet
https://www.canadapharmacy.com/products/accolate
https://www.canadapharmacy.com/products/abilify-mt
I need all the information from the page. I wrote the below code:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

base_url = 'https://www.canadapharmacy.com'
data = []
for i in tqdm(range(len(medicine_url))):
    r = requests.get(base_url + medicine_url[i])
    soup = BeautifulSoup(r.text, 'lxml')

    # Scraping medicine name
    try:
        main_name = soup.find('h1', {"class": "mn"}).text.strip()
    except:
        main_name = None
    try:
        sec_name = soup.find('div', {"class": "product-name"}).find('h3').text.strip()
    except:
        sec_name = None
    try:
        generic_name = soup.find('div', {"class": "card product generic strength equal"}).find('div').find('h3').text.strip()
    except:
        generic_name = None

    # Description
    try:
        des1 = soup.find('div', {"class": "answer expanded"}).find_all('p')[1].text
    except:
        des1 = ''
    try:
        des2 = soup.find('div', {"class": "answer expanded"}).find('ul').text
    except:
        des2 = ''
    try:
        des3 = soup.find('div', {"class": "answer expanded"}).find_all('p')[2].text
    except:
        des3 = ''
    desc = (des1 + des2 + des3).replace('\n', ' ')

    # Directions
    try:
        dir1 = soup.find('div', {"class": "answer expanded"}).find_all('h4')[1].text
    except:
        dir1 = ''
    try:
        dir2 = soup.find('div', {"class": "answer expanded"}).find_all('p')[5].text
    except:
        dir2 = ''
    try:
        dir3 = soup.find('div', {"class": "answer expanded"}).find_all('p')[6].text
    except:
        dir3 = ''
    try:
        dir4 = soup.find('div', {"class": "answer expanded"}).find_all('p')[7].text
    except:
        dir4 = ''
    directions = dir1 + dir2 + dir3 + dir4

    # Ingredients
    try:
        ing = soup.find('div', {"class": "answer expanded"}).find_all('p')[9].text
    except:
        ing = None

    # Cautions
    try:
        c1 = soup.find('div', {"class": "answer expanded"}).find_all('h4')[3].text
    except:
        c1 = None
    try:
        c2 = soup.find('div', {"class": "answer expanded"}).find_all('p')[11].text
    except:
        c2 = ''
    try:
        c3 = soup.find('div', {"class": "answer expanded"}).find_all('p')[12].text  # //div[@class='answer expanded']//p[2]
    except:
        c3 = ''
    try:
        c4 = soup.find('div', {"class": "answer expanded"}).find_all('p')[13].text
    except:
        c4 = ''
    try:
        c5 = soup.find('div', {"class": "answer expanded"}).find_all('p')[14].text
    except:
        c5 = ''
    try:
        c6 = soup.find('div', {"class": "answer expanded"}).find_all('p')[15].text
    except:
        c6 = ''
    caution = (c1 + c2 + c3 + c4 + c5 + c6).replace('\xa0', '')

    # Side Effects
    try:
        se1 = soup.find('div', {"class": "answer expanded"}).find_all('h4')[4].text
    except:
        se1 = ''
    try:
        se2 = soup.find('div', {"class": "answer expanded"}).find_all('p')[18].text
    except:
        se2 = ''
    try:
        se3 = soup.find('div', {"class": "answer expanded"}).find_all('ul')[1].text
    except:
        se3 = ''
    try:
        se4 = soup.find('div', {"class": "answer expanded"}).find_all('p')[19].text
    except:
        se4 = ''
    try:
        se5 = soup.find('div', {"class": "post-author-bio"}).text
    except:
        se5 = ''
    se = (se1 + se2 + se3 + se4 + se5).replace('\n', ' ')

    # Product code
    for j in soup.find('div', {"class": "answer expanded"}).find_all('h4'):
        if 'Product Code' in j.text:
            prod_code = j.text
    # prod_code = soup.find('div',{"class":"answer expanded"}).find_all('h4')[5].text  # //div[@class='answer expanded']//h4

    pharma = {"primary_name": main_name,
              "secondary_name": sec_name,
              "Generic_Name": generic_name,
              "Description": desc,
              "Directions": directions,
              "Ingredients": ing,
              "Caution": caution,
              "Side_Effects": se,
              "Product_Code": prod_code}
    data.append(pharma)
But each page has these tags in different positions, so the code does not pick up the correct data. So, I tried:
soup.find('div',{"class":"answer expanded"}).find_all('h4')
which gives me the output:-
[<h4>Description </h4>,
<h4>Directions</h4>,
<h4>Ingredients</h4>,
<h4>Cautions</h4>,
<h4>Side Effects</h4>,
<h4>Product Code : 5513 </h4>]
I want to create a data frame where Description holds all the description text given on the page, Directions holds all the directions text, and so on. I tried:
for i in soup.find('div',{"class":"answer expanded"}).find_all('h4'):
    if 'Description' in i.text:
        print(soup.find('div',{"class":"answer expanded"}).findAllNext('p'))
but it prints every <p> tag that follows, not just the ones that give me the description of the medicine.
Can anyone suggest how to do this? Also, how can I scrape the rate table from the page? It currently gives me the values in an inappropriate fashion.
You can try the next working example:
import requests
from bs4 import BeautifulSoup
import pandas as pd

data = []
r = requests.get('https://www.canadapharmacy.com/products/abilify-tablet')
soup = BeautifulSoup(r.text, "lxml")
try:
    # flatten the whole "answer expanded" block to text,
    # then split it on the section headings
    card = ''.join([x.get_text(' ', strip=True) for x in soup.select('div.answer.expanded')])
    des = card.split('Directions')[0].replace('Description', '')
    drc = card.split('Directions')[1].split('Ingredients')[0]
    ingre = card.split('Directions')[1].split('Ingredients')[1].split('Cautions')[0]
    cau = card.split('Directions')[1].split('Ingredients')[1].split('Cautions')[1].split('Side Effects')[0]
    se = card.split('Directions')[1].split('Ingredients')[1].split('Cautions')[1].split('Side Effects')[1]
except:
    pass

data.append({
    'Description': des,
    'Directions': drc,
    'Ingredients': ingre,
    'Cautions': cau,
    'Side Effects': se
})
print(data)
# df = pd.DataFrame(data)
# print(df)
Output:
[{'Description': " Abilify Tablet (Aripiprazole) Abilify (Aripiprazole) is a medication prescribed to treat or manage different conditions, including: Agitation associated with schizophrenia or bipolar mania (injection formulation only) Irritability associated with autistic disorder Major depressive disorder , adjunctive treatment Mania and mixed episodes associated with Bipolar I disorder Tourette's disorder Schizophrenia Abilify works by activating different neurotransmitter receptors located in brain cells. Abilify activates D2 (dopamine) and 5-HT1A (serotonin) receptors and blocks 5-HT2A (serotonin) receptors. This combination of receptor activity is responsible for the treatment effects of Abilify. Conditions like schizophrenia, major depressive disorder, and bipolar disorder are caused by neurotransmitter imbalances in the brain. Abilify helps to correct these imbalances and return the normal functioning of neurons. ", 'Directions': ' Once you are prescribed and buy Abilify, then take Abilify exactly as prescribed by your
doctor. The dose will vary based on the condition that you are treating. The starting dose of Abilify ranges from 2-15 mg once daily, and the recommended dose for most conditions is between 5-15 mg once daily. The maximum dose is 30 mg once daily. Take Abilify with or without food. ', 'Ingredients': ' The active ingredient in Abilify medication is aripiprazole . ', 'Cautions': ' Abilify and other antipsychotic medications have been associated with an increased risk of death in elderly patients with dementia-related psychosis. When combined with other dopaminergic agents, Abilify can increase the risk of neuroleptic malignant syndrome. Abilify can cause metabolic changes and in some cases can induce high blood sugar in people with and without diabetes . Abilify can also weight gain and increased risk of dyslipidemia. Blood glucose should be monitored while taking Abilify. Monitor for low blood pressure and heart rate while taking Abilify; it can cause orthostatic hypertension which may lead to dizziness or fainting. Use with caution in patients with a history of seizures. ', 'Side Effects': ' The side effects of Abilify vary greatly depending
on what condition is being treated, what other medications are being used concurrently, and what dose is being taken. Speak with your doctor or pharmacist for a full list of side effects that apply to you. Some of the most common side effects include: Akathisia Blurred vision Constipation Dizziness Drooling Extrapyramidal disorder Fatigue Headache Insomnia Nausea Restlessness Sedation Somnolence Tremor Vomiting Buy Abilify online from Canada Pharmacy . Abilify can be purchased online with a valid prescription from a doctor. About Dr. Conor Sheehy (Page Author) Dr. Sheehy (BSc Molecular Biology, PharmD) works a clinical pharmacist specializing in cardiology, oncology, and ambulatory care. He’s a board-certified pharmacotherapy specialist (BCPS), and his experience working one-on-one with patients to fine tune their medication and therapy plans for optimal results makes him a valuable subject matter expert for our pharmacy. Read More.... IMPORTANT NOTE: The above information is intended to increase awareness of health information
and does not suggest treatment or diagnosis. This information is not a substitute for individual medical attention and should not be construed to indicate that use of the drug is safe, appropriate, or effective for you. See your health care professional for medical advice and treatment. Product Code : 5513'}]
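If the keyword-splitting above feels brittle across pages, another sketch is to key each section off its <h4> heading and collect the sibling tags up to the next heading. This assumes the paragraphs are siblings of the headings inside the "answer expanded" div, which matches the h4 list shown in the question but is still an assumption about the page layout:

import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.canadapharmacy.com/products/abilify-tablet')
soup = BeautifulSoup(r.text, 'lxml')

sections = {}
container = soup.find('div', {"class": "answer expanded"})
if container:
    for h4 in container.find_all('h4'):
        parts = []
        for sib in h4.find_next_siblings():
            if sib.name == 'h4':  # stop at the next section heading
                break
            parts.append(sib.get_text(' ', strip=True))
        sections[h4.get_text(strip=True)] = ' '.join(parts)

print(sections.keys())  # e.g. Description, Directions, Ingredients, ...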
I have a URL DataFrame. My code aims to use machine learning to classify each URL as benign or malicious.
I want to use host-based features, getting each URL's creation_date, last_updated date, and expiration_date with the whois package, but it shows an error.
Could somebody help me fix it?
Here is my code and the error below.
# URL DataFrame
URL Lable
0 http://ovaismirza-politicalthoughts.blogspot.com/ 0
1 http://www.bluemoontea.com/ 0
2 http://www.viettiles.com/public/default/ckedit... 1
3 http://173.212.217.250/hescientiststravelled/o... 1
4 http://www.hole-in-the-wall.com/ 0
### Code
date = []
for i in range(len(df)):
    item = df["URL"].loc[i]
    domain = urlparse(item).netloc
    cr = whois.query(domain).creation_date
    up = whois.query(domain).last_updated
    exp = whois.query(domain).expiration_date
    if cr is not None and up is not None and exp is not None:
        date.append(0)
    else:
        date.append(1)
### Error
Exception                                 Traceback (most recent call last)
<ipython-input-26-0d7930e66020> in <module>
3 item = df["URL"].loc[i]
4 domain = urlparse(item).netloc
----> 5 cr = whois.query(domain).creation_date
6 up = whois.query(domain).last_updated
7 exp = whois.query(domain).expiration_date
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/whois/__init__.py in query(domain, force, cache_file, slow_down, ignore_returncode)
48
49 while 1:
---> 50 pd = do_parse(do_query(d, force, cache_file, slow_down, ignore_returncode), tld)
51 if (not pd or not pd['domain_name'][0]) and len(d) > 2: d = d[1:]
52 else: break
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/whois/_1_query.py in do_query(dl, force, cache_file, slow_down, ignore_returncode)
42 CACHE[k] = (
43 int(time.time()),
---> 44 _do_whois_query(dl, ignore_returncode),
45 )
46 if cache_file: cache_save(cache_file)
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/whois/_1_query.py in _do_whois_query(dl, ignore_returncode)
59 r = p.communicate()[0]
60 r = r.decode() if PYTHON_VERSION == 3 else r
---> 61 if not ignore_returncode and p.returncode != 0: raise Exception(r)
62 return r
63
Exception: whois: connect(): Operation timed out
% IANA WHOIS server
% for more information on IANA, visit http://www.iana.org
% This query returned 1 object
refer: whois.verisign-grs.com
domain: COM
organisation: VeriSign Global Registry Services
address: 12061 Bluemont Way
address: Reston Virginia 20190
address: United States
contact: administrative
name: Registry Customer Service
organisation: VeriSign Global Registry Services
address: 12061 Bluemont Way
address: Reston Virginia 20190
address: United States
phone: +1 703 925-6999
fax-no: +1 703 948 3978
e-mail: info@verisign-grs.com
contact: technical
name: Registry Customer Service
organisation: VeriSign Global Registry Services
address: 12061 Bluemont Way
address: Reston Virginia 20190
address: United States
phone: +1 703 925-6999
fax-no: +1 703 948 3978
e-mail: info@verisign-grs.com
nserver: A.GTLD-SERVERS.NET 192.5.6.30 2001:503:a83e:0:0:0:2:30
nserver: B.GTLD-SERVERS.NET 192.33.14.30 2001:503:231d:0:0:0:2:30
nserver: C.GTLD-SERVERS.NET 192.26.92.30 2001:503:83eb:0:0:0:0:30
nserver: D.GTLD-SERVERS.NET 192.31.80.30 2001:500:856e:0:0:0:0:30
nserver: E.GTLD-SERVERS.NET 192.12.94.30 2001:502:1ca1:0:0:0:0:30
nserver: F.GTLD-SERVERS.NET 192.35.51.30 2001:503:d414:0:0:0:0:30
nserver: G.GTLD-SERVERS.NET 192.42.93.30 2001:503:eea3:0:0:0:0:30
nserver: H.GTLD-SERVERS.NET 192.54.112.30 2001:502:8cc:0:0:0:0:30
nserver: I.GTLD-SERVERS.NET 192.43.172.30 2001:503:39c1:0:0:0:0:30
nserver: J.GTLD-SERVERS.NET 192.48.79.30 2001:502:7094:0:0:0:0:30
nserver: K.GTLD-SERVERS.NET 192.52.178.30 2001:503:d2d:0:0:0:0:30
nserver: L.GTLD-SERVERS.NET 192.41.162.30 2001:500:d937:0:0:0:0:30
nserver: M.GTLD-SERVERS.NET 192.55.83.30 2001:501:b1f9:0:0:0:0:30
ds-rdata: 30909 8 2 E2D3C916F6DEEAC73294E8268FB5885044A833FC5459588F4A9184CFC41A5766
whois: whois.verisign-grs.com
status: ACTIVE
remarks: Registration information: http://www.verisigninc.com
created: 1985-01-01
changed: 2017-10-05
source: IANA
Domain Name: VIETTILES.COM
Registry Domain ID: 1827514943_DOMAIN_COM-VRSN
Registrar WHOIS Server: whois.pavietnam.vn
Registrar URL: http://www.pavietnam.vn
Updated Date: 2018-09-07T01:13:32Z
Creation Date: 2013-09-14T04:35:12Z
Registry Expiry Date: 2019-09-14T04:35:12Z
Registrar: P.A. Viet Nam Company Limited
Registrar IANA ID: 1649
Registrar Abuse Contact Email: abuse@pavietnam.vn
Registrar Abuse Contact Phone: +84.19009477
Domain Status: clientTransferProhibited https://icann.org/epp#clientTransferProhibited
Name Server: NS1.PAVIETNAM.VN
Name Server: NS2.PAVIETNAM.VN
Name Server: NSBAK.PAVIETNAM.NET
DNSSEC: unsigned
URL of the ICANN Whois Inaccuracy Complaint Form: https://www.icann.org/wicf/
Last update of whois database: 2018-12-25T13:33:54Z <<<
By the way, can I use other methods in Python 3 to get the URL creation_date, updated_date, and expired_date instead of whois?
Thanks in advance!
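One sketch for keeping the loop alive despite WHOIS timeouts is to catch the exception per URL and to query each domain once instead of three times. The attribute names are taken from the question's own code; whether whois.query supports every TLD in the data is not guaranteed:

import whois
from urllib.parse import urlparse

def whois_dates_missing(url):
    """Return 0 if creation, update and expiry dates are all present,
    else 1 (the same 0/1 encoding used in the question)."""
    try:
        domain = urlparse(url).netloc
        entry = whois.query(domain)  # one network query instead of three
        if entry and entry.creation_date and entry.last_updated and entry.expiration_date:
            return 0
    except Exception:
        pass  # timeouts, unsupported TLDs, malformed domains, ...
    return 1

date = [whois_dates_missing(u) for u in df["URL"]]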
I'm working on a movie database, getting responses from IMDb. I'm getting the response in XML format, but it has no nested tags; all the information is mixed together in one element. How can I get each piece of data from it?
Here's how the response shows up:
<?xml version="1.0" encoding="UTF-8"?><root response="True">
<movie title="Batman" year="1989" rated="PG-13" released="23 Jun 1989" runtime="126 min" genre="Action, Adventure" director="Tim Burton" writer="Bob Kane (Batman characters), Sam Hamm (story), Sam Hamm (screenplay), Warren Skaaren (screenplay)" actors="Michael Keaton, Jack Nicholson, Kim Basinger, Robert Wuhl" plot="Gotham City. Crime boss Carl Grissom (Jack Palance) effectively runs the town but there's a new crime fighter in town - Batman (Michael Keaton). Grissom's right-hand man is Jack Napier (Jack Nicholson), a brutal man who is not entirely sane... After falling out between the two Grissom has Napier set up with the Police and Napier falls to his apparent death in a vat of chemicals. However, he soon reappears as The Joker and starts a reign of terror in Gotham City. Meanwhile, reporter Vicki Vale (Kim Basinger) is in the city to do an article on Batman. She soon starts a relationship with Batman's everyday persona, billionaire Bruce Wayne." language="English, French, Spanish" country="USA, UK" awards="Won 1 Oscar. Another 8 wins & 26 nominations." poster="https://m.media-amazon.com/images/M/MV5BMTYwNjAyODIyMF5BMl5BanBnXkFtZTYwNDMwMDk2._V1_SX300.jpg" metascore="69" imdbRating="7.6" imdbVotes="302,842" imdbID="tt0096895" type="movie" />
</root>
Here is my answer to your question
xmlRaw = """<?xml version="1.0" encoding="UTF-8"?>
<root response="True">
<movie
title = "Batman"
year = "1989"
rated = "PG-13"
released = "23 Jun 1989"
runtime = "126 min"
genre = "Action, Adventure"
director = "Tim Burton"
writer = "Bob Kane (Batman characters), Sam Hamm (story), Sam Hamm (screenplay), Warren Skaaren (screenplay)"
actors = "Michael Keaton, Jack Nicholson, Kim Basinger, Robert Wuhl"
plot = "Gotham City. Crime boss Carl Grissom (Jack Palance) effectively runs the town but there's a new crime fighter in town - Batman (Michael Keaton). Grissom's right-hand man is Jack Napier (Jack Nicholson), a brutal man who is not entirely sane... After falling out between the two Grissom has Napier set up with the Police and Napier falls to his apparent death in a vat of chemicals. However, he soon reappears as The Joker and starts a reign of terror in Gotham City. Meanwhile, reporter Vicki Vale (Kim Basinger) is in the city to do an article on Batman. She soon starts a relationship with Batman's everyday persona, billionaire Bruce Wayne."
language = "English, French, Spanish"
country = "USA, UK"
awards = "Won 1 Oscar. Another 8 wins & 26 nominations."
poster = "https://m.media-amazon.com/images/M/MV5BMTYwNjAyODIyMF5BMl5BanBnXkFtZTYwNDMwMDk2._V1_SX300.jpg"
metascore = "69"
imdbRating = "7.6"
imdbVotes = "302,842"
imdbID = "tt0096895"
type = "movie" />
</root>"""
def getValue(xml, value):
    returnData = None
    for line in xml.split('\n'):  # use the parameter, not the global
        if value in line:
            returnData = line
    return returnData

print(getValue(xmlRaw, 'title'))
print(getValue(xmlRaw, 'year'))
print(getValue(xmlRaw, 'rated'))
print(getValue(xmlRaw, 'released'))
# add more as you need the data
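Since the response is actually well-formed XML with everything stored as attributes of the <movie> element, a more robust sketch is to parse it with the standard library instead of matching substrings line by line:

import xml.etree.ElementTree as ET

xml_raw = '''<?xml version="1.0" encoding="UTF-8"?><root response="True">
<movie title="Batman" year="1989" rated="PG-13" released="23 Jun 1989" />
</root>'''

movie = ET.fromstring(xml_raw).find('movie')
print(movie.get('title'))  # Batman
print(movie.get('year'))   # 1989
print(movie.attrib)        # every attribute as a plain dict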
Below is the code I was trying out, but I am not getting the expected results.
import re

def multiwordReplace(text, wordDic):
    """
    Take a text and replace words that match a key in a dictionary with
    the associated value; return the changed text.
    """
    rc = re.compile('|'.join(map(re.escape, wordDic)))
    def translate(match):
        return wordDic[match.group(0)]
    return rc.sub(translate, text)

wordDic = {
    'ANGLO': 'ANGLO IRISH BANK',
    'ANGLO IRISH': 'ANGLO IRISH BANK'
}

def replace(match):
    return wordDic[match.group(0)]
    # return ''.join(y for y in match.group(0).split())

str1 = {'ANGLO IRISH CORP PLC - THIS FOLLOWS THE BANK NATIONALIZATION BY THE GOVT OF THE REPUBLIC OF IRELAND',
        'ANGLO CORP PLC - THIS FOLLOWS THE BANKS NATIONALIZATION BY THE GOVT OF THE REPUBLIC OF IRELAND'}

for item in str1:
    str2 = multiwordReplace(item, wordDic)
    print str2
    print re.sub('|'.join(r'\b%s\b' % re.escape(s) for s in wordDic),
                 replace, item)
Output:
ANGLO IRISH BANK IRISH CORP PLC - THIS FOLLOWS THE BANK NATIONALIZATION BY THE GOVT OF THE REPUBLIC OF IRELAND
ANGLO IRISH BANK CORP PLC - THIS FOLLOWS THE BANKS NATIONALIZATION BY THE GOVT OF THE REPUBLIC OF IRELAND
The first one should give only 'ANGLO IRISH BANK', not 'ANGLO IRISH BANK IRISH'.
Sort so that the longest possible match appears first.
longest_first = sorted(wordDic, key=len, reverse=True)
rc = re.compile('|'.join(map(re.escape, longest_first)))
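Putting that together with the word boundaries from the question, a minimal sketch:

import re

wordDic = {
    'ANGLO': 'ANGLO IRISH BANK',
    'ANGLO IRISH': 'ANGLO IRISH BANK',
}

# longest keys first, so 'ANGLO IRISH' is tried before 'ANGLO'
longest_first = sorted(wordDic, key=len, reverse=True)
rc = re.compile('|'.join(r'\b%s\b' % re.escape(s) for s in longest_first))

text = 'ANGLO IRISH CORP PLC - THIS FOLLOWS THE BANK NATIONALIZATION'
print(rc.sub(lambda m: wordDic[m.group(0)], text))
# ANGLO IRISH BANK CORP PLC - THIS FOLLOWS THE BANK NATIONALIZATION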