How to scrape data from multiple pages using BeautifulSoup - Python

I am trying to scrape multiple pages, but the script gives me nothing. Kindly help me resolve this issue.
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}

for page in range(1, 2):
    response = requests.get(
        "https://www.avbuyer.com/aircraft/private-jets={page}".format(page=page),
        headers=headers,
    )
    soup = BeautifulSoup(response.content, 'html.parser')
    postings = soup.find_all('div', class_ = 'listing-item premium')
    for post in postings:
        link = post.find('a', class_ = 'more-info').get('href')
        link_full = 'https://www.avbuyer.com' + link
        plane = post.find('h2', class_ = 'item-title').text
        price = post.find('div', class_ = 'price').text
        location = post.find('div', class_ = 'list-item-location').text
        print(location)

The problem was in the URL: it should end with /page-{page} rather than ={page}. Now your code works fine:
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}

for page in range(1, 2):  # widen the range, e.g. range(1, 6), to scrape more pages
    response = requests.get(
        "https://www.avbuyer.com/aircraft/private-jets/page-{page}".format(page=page),
        headers=headers,
    )
    soup = BeautifulSoup(response.content, 'html.parser')
    postings = soup.find_all('div', class_='listing-item premium')
    for post in postings:
        link = post.find('a', class_='more-info').get('href')
        link_full = 'https://www.avbuyer.com' + link
        plane = post.find('h2', class_='item-title').text
        price = post.find('div', class_='price').text
        location = post.find('div', class_='list-item-location').text
        print(location)
Output:
North America + Canada, United States - MD, For Sale by Avpro Inc.
North America + Canada, United States - WI, For Sale by Lone Mountain Aircraft Sales
North America + Canada, United States - MD, For Sale by Avpro Inc.
North America + Canada, United States - MD, For Sale by Avpro Inc.
Europe, Monaco, For Sale by Global Jet Monaco
South America, Puerto Rico, For Sale by JetHQ
North America + Canada, United States - NE, For Sale by Duncan Aviation
North America + Canada, United States - DE, For Sale by Leading Edge Aviation Solutions
North America + Canada, United States - TX, For Sale by Par Avion Ltd.
North America + Canada, United States - MD, For Sale by Avpro Inc.
Europe, Switzerland, For Sale by Jetcraft
Europe, United Kingdom - England, For Sale by Jets4UDirect Ltd
North America + Canada, United States - MD, For Sale by Avpro Inc.
North America + Canada, United States - MT, For Sale by SkyWorld Aviation
North America + Canada, United States - MD, For Sale by Avpro Inc.
North America + Canada, United States - AZ, For Sale by Hatt & Associates
Europe, Switzerland, For Sale by Jetcraft
North America + Canada, United States - MD, For Sale by Avpro Inc.
North America + Canada, United States - MD, For Sale by Avpro Inc.
North America + Canada, United States - MD, For Sale by Avpro Inc.
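If you also want to keep the other fields instead of just printing the location, here is a minimal sketch that reuses the same selectors and collects every posting into a pandas DataFrame (the page range is just an example):

import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}
rows = []

for page in range(1, 3):  # adjust to cover the pages you need
    url = "https://www.avbuyer.com/aircraft/private-jets/page-{page}".format(page=page)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    for post in soup.find_all('div', class_='listing-item premium'):
        rows.append({
            'plane': post.find('h2', class_='item-title').text,
            'price': post.find('div', class_='price').text,
            'location': post.find('div', class_='list-item-location').text,
            'link': 'https://www.avbuyer.com' + post.find('a', class_='more-info').get('href'),
        })

df = pd.DataFrame(rows)
print(df)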

Related

Web Scraping & BeautifulSoup <li> parsing

I'm just learning web scraping and want to output the results from this website to a CSV file:
https://www.avbuyer.com/aircraft/private-jets
but I am struggling with the year, s/n and total time fields in the code below.
When I put "soup" in place of "post" it works, but not when I want to put them together.
Any help would be much appreciated.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.avbuyer.com/aircraft/private-jets'
page = requests.get(url)
page

soup = BeautifulSoup(page.text, 'lxml')
soup

df = pd.DataFrame({'Plane':[''], 'Year':[''], 'S/N':[''], 'Total Time':[''], 'Price':[''], 'Location':[''], 'Description':[''], 'Tag':[''], 'Last updated':[''], 'Link':['']})

while True:
    postings = soup.find_all('div', class_ = 'listing-item premium')
    for post in postings:
        try:
            link = post.find('a', class_ = 'more-info').get('href')
            link_full = 'https://www.avbuyer.com' + link
            plane = post.find('h2', class_ = 'item-title').text
            price = post.find('div', class_ = 'price').text
            location = post.find('div', class_ = 'list-item-location').text
            year = post.find_all('ul', class_ = 'fa-no-bullet clearfix')[2]
            year.find_all('li')[0].text
            sn = post.find('ul', class_ = 'fa-no-bullet clearfix')[2]
            sn.find('li')[1].text
            time = post.find('ul', class_ = 'fa-no-bullet clearfix')[2]
            time.find('li')[2].text
            desc = post.find('div', classs_ = 'list-item-para').text
            tag = post.find('div', class_ = 'list-viewing-date').text
            updated = post.find('div', class_ = 'list-update').text
            df = df.append({'Plane':plane, 'Year':year, 'S/N':sn, 'Total Time':time, 'Price':price, 'Location':location,
                            'Description':desc, 'Tag':tag, 'Last updated':updated, 'Link':link_full}, ignore_index = True)
        except:
            pass
    next_page = soup.find('a', {'rel':'next'}).get('href')
    next_page_full = 'https://www.avbuyer.com' + next_page
    next_page_full
    url = next_page_full
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')

df.to_csv('/Users/xxx/avbuyer.csv')
Try this:
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get('https://www.avbuyer.com/aircraft/private-jets')
soup = BeautifulSoup(response.content, 'html.parser')
postings = soup.find_all('div', class_ = 'listing-item premium')

temp = []
for post in postings:
    link = post.find('a', class_ = 'more-info').get('href')
    link_full = 'https://www.avbuyer.com' + link
    plane = post.find('h2', class_ = 'item-title').text
    price = post.find('div', class_ = 'price').text
    location = post.find('div', class_ = 'list-item-location').text
    t = post.find_all('div', class_='list-other-dtl')
    for i in t:
        data = [tup.text for tup in i.find_all('li')]
        years = data[0]
        s = data[1]
        total_time = data[2]
        temp.append([plane, price, location, link_full, years, s, total_time])

df = pd.DataFrame(temp, columns=["plane", "price", "location", "link", "Years", "S/N", "Totaltime"])
print(df)
Output:
plane price location link Years S/N Totaltime
0 Dassault Falcon 2000LXS Make offer North America + Canada, United States - MD, Fo... https://www.avbuyer.com/aircraft/private-jets/... Year 2021 S/N 377 Total Time 33
1 Cirrus Vision SF50 G1 Please call North America + Canada, United States - WI, Fo... https://www.avbuyer.com/aircraft/private-jets/... Year 2018 S/N 0080 Total Time 615
2 Gulfstream IV Make offer North America + Canada, United States - MD, Fo... https://www.avbuyer.com/aircraft/private-jets/... Year 1990 S/N 1148 Total Time 6425
4 Boeing 787-8 Make offer Europe, Monaco, For Sale by Global Jet Monaco https://www.avbuyer.com/aircraft/private-jets/... Year 2010 S/N - Total Time 1
5 Hawker 4000 Make offer South America, Puerto Rico, For Sale by JetHQ https://www.avbuyer.com/aircraft/private-jets/... Year 2009 S/N RC-24 Total Time 2120
6 Embraer Legacy 500 Make offer North America + Canada, United States - NE, Fo... https://www.avbuyer.com/aircraft/private-jets/... Year 2015 S/N 55000016 Total Time 2607
7 Dassault Falcon 2000LXS Make offer North America + Canada, United States - DE, Fo... https://www.avbuyer.com/aircraft/private-jets/... Year 2015 S/N 300 Total Time 1909
8 Dassault Falcon 50EX Please call North America + Canada, United States - TX, Fo... https://www.avbuyer.com/aircraft/private-jets/... Year 2002 S/N 320 Total Time 7091.9
9 Dassault Falcon 2000 Make offer North America + Canada, United States - MD, Fo... https://www.avbuyer.com/aircraft/private-jets/... Year 2001 S/N 146 Total Time 6760
10 Bombardier Learjet 75 Make offer Europe, Switzerland, For Sale by Jetcraft https://www.avbuyer.com/aircraft/private-jets/... Year 2014 S/N 45-491 Total Time 1611
11 Hawker 800B Please call Europe, United Kingdom - England, For Sale by ... https://www.avbuyer.com/aircraft/private-jets/... Year 1985 S/N 258037 Total Time 9621
13 BAe Avro RJ100 Please call North America + Canada, United States - MT, Fo... https://www.avbuyer.com/aircraft/private-jets/... Year 1996 S/N E3282 Total Time 45996
14 Embraer Legacy 600 Make offer North America + Canada, United States - MD, Fo... https://www.avbuyer.com/aircraft/private-jets/... Year 2007 S/N 14501014 Total Time 4328
15 Bombardier Challenger 850 Make offer North America + Canada, United States - AZ, Fo... https://www.avbuyer.com/aircraft/private-jets/... Year 2003 S/N 7755 Total Time 12114.1
16 Gulfstream G650 Please call Europe, Switzerland, For Sale by Jetcraft https://www.avbuyer.com/aircraft/private-jets/... Year 2013 S/N 6047 Total Time 2178
17 Bombardier Learjet 55 Price: USD $995,000 North America + Canada, United States - MD, Fo... https://www.avbuyer.com/aircraft/private-jets/... Year 1982 S/N 020 Total Time 13448
18 Dassault Falcon 8X Please call North America + Canada, United States - MD, Fo... https://www.avbuyer.com/aircraft/private-jets/... Year 2016 S/N 406 Total Time 1627
19 Hawker 800XP Price: USD $1,595,000 North America + Canada, United States - MD, Fo... https://www.avbuyer.com/aircraft/private-jets/... Year 2002 S/N 258578 Total Time 10169
Right now, your try-except clauses are not allowing you to see and debug the errors in your script. If you remove them, you will see:
IndexError: list index out of range in line 24. There are only two elements inside the list, and index [2] asks for a third one; you want the second element, at index [1]. Therefore, your line should be:
year = post.find_all('ul', class_ = 'fa-no-bullet clearfix')[1]
KeyError: 2 in line 26. You are using find(), which returns a <class 'bs4.element.Tag'> object, not a list. Here you want to use find_all() as you did in line 24. Same happens for line 28.
However, instead of repeating this expression three times, you should store the result in a variable and reuse it, for example:
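A sketch of that change, reusing a single find_all() result (with the corrected index [1] from above):

details = post.find_all('ul', class_ = 'fa-no-bullet clearfix')[1]
year = details.find_all('li')[0].text
sn = details.find_all('li')[1].text
time = details.find_all('li')[2].text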
AttributeError: 'NoneType' object has no attribute 'text' in line 31. There is a typo: you wrote classs_ instead of class_.
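With the typo fixed, that line becomes:

desc = post.find('div', class_ = 'list-item-para').text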
AttributeError: 'NoneType' object has no attribute 'text' in line 32. There is nothing wrong with your code. Instead, there are some entries in the webpage that don't have this element. You should check if the find method gave you any result.
tag = post.find('div', class_ = 'list-viewing-date')
if tag:
    tag = tag.text
else:
    tag = None
You don't have a way out of your while loop. You should detect when the script cannot find a next_page link and break out, for example:
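A sketch of that exit, assuming the last page simply has no rel="next" link:

next_page = soup.find('a', {'rel': 'next'})
if next_page is None:
    break  # no further pages, leave the while loop
next_page_full = 'https://www.avbuyer.com' + next_page.get('href')
page = requests.get(next_page_full)
soup = BeautifulSoup(page.text, 'lxml')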
After changing all this, it worked for me to scrape the first page. I used:
Python 3.9.7
bs4 4.10.0
It is very important that you state what versions of Python and the libraries you are using.
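For example, you can print them from the script itself:

import sys
import bs4

print(sys.version)       # Python interpreter version
print(bs4.__version__)   # BeautifulSoup (bs4) version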
Cheers!

ValueError: All arrays must be of the same length when appending data to a DataFrame

import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}

productlink = []
n = []
a = []
re = []
ra = []
w = []

r = requests.get('https://www.houzz.com/professionals/general-contractor')
soup = BeautifulSoup(r.content, 'html.parser')
tra = soup.find_all('div', class_='hz-pro-search-result__info')

for pro in tra:
    name = pro.find('span', class_='mlm header-5 text-unbold').text
    n.append(name)
    address = pro.find('span', class_='hz-pro-search-result__location-info__text').text
    a.append(address)
    reviews = pro.find('span', class_='hz-star-rate__review-string').text
    re.append(reviews)
    rating = pro.find('span', class_='hz-star-rate__rating-number').text
    ra.append(rating)

for links in tra:
    for link in links.find_all('a', href=True)[2:]:
        if link['href'].startswith('https://www.houzz.com/professionals/general-contractors'):
            productlink.append(link['href'])

for link in productlink:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    for web in soup.find_all('a', attrs={'class':'sc-62xgu6-0 jxCcwv mwxddt-0 bSdLOV hui-link trackMe'}):
        w.append(web['href'])

df = pd.DataFrame({'name':n, 'address':a, 'reviews':re, 'rating':ra, 'web':w})
print(df)
The code works well until I try to append the data into the DataFrame, at which point it raises ValueError: All arrays must be of the same length. How can I append this data to the DataFrame and resolve this issue? I would be very thankful for any help.
This is my output:
Capital Remodeling Hanover, Maryland 21076, United States 409 Reviews 4.8
SOD Home Group 367 Santana Heights, Unit #3-3021, San Jose, California 95128, United States 238 Reviews 5.0
Innovative Construction Inc. 3040 Amwiler Rd, Suite B, Peachtree Corners, Georgia 30360, United States 100 Reviews 5.0
Baron Construction & Remodeling Co. Saratoga & Los Angeles, California 95070, United States 69 Reviews 4.8
Luxe Remodel 329 N. Wetherly Dr., Suite 205, Los Angeles, California 90211, United States 79 Reviews 4.9
California Home Builders & Remodeling Inc. STUDIO CITY, California 91604, United States 232 Reviews 5.0
Sneller Custom Homes and Remodeling, LLC 17018 Seven Pines Dr Ste 100, Spring, Texas 77379, United States 77 Reviews 4.9
123 Remodeling Inc. 5070 N. Kimberly Ave Suite C, Chicago, Illinois 60630, United States 83 Reviews 4.7
Professional builders & Remodeling, Inc 15335 Morrison St #325, Sherman Oaks, California 91403, United States 203 Reviews 5.0
Rudloff Custom Builders 896 Breezewood Lane, West Chester, Pennsylvania 19382, United States 111 Reviews 5.0
LAR Construction & Remodeling 6371 canby ave, Tarzana, California 91335, United States 191 Reviews 5.0
Erie Construction Mid West 4271 Monroe St., Toledo, Ohio 43606, United States 231 Reviews 4.8
Regal Construction & Remodeling Inc. 19537 ½ Ventura Blvd., Tarzana, California 91356, United States 96 Reviews 4.8
Mr. & Mrs. Construction & Remodeling 2570 N 1st street, ste 212, San Jose, California 95131, United States 75 Reviews 5.0
Bailey Remodeling and Construction LLC 201 Meridian Ave., Suite 201, Louisville, Kentucky 40207, United States 106 Reviews 5.0
https://www.houzz.com/trk/aHR0cDovL3d3dy5iYWlsZXlyZW1vZGVsLmNvbQ/2f005891e940e2c01021b57733580fa3/ue/NDU3NDcxNQ/a3be682e415d6c23590401e416ee1018
The error comes from pd.DataFrame receiving lists of different lengths: w is filled in a separate loop, and not every profile page yields a matching link, so len(w) does not match the other lists. Make it as simple as possible: do not store the information from different loops in a bunch of separate lists; store everything for each row in one dict.
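A tiny illustration of the same failure, with hypothetical columns of unequal length:

import pandas as pd

# two columns of different lengths raise the same error
pd.DataFrame({'name': ['a', 'b', 'c'], 'web': ['x', 'y']})
# ValueError: All arrays must be of the same length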
A possible solution:
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}

r = requests.get('https://www.houzz.com/professionals/general-contractor')
soup = BeautifulSoup(r.content, 'html.parser')
tra = soup.find_all('div', class_='hz-pro-search-result__info')

data = []
for pro in tra:
    name = pro.find('span', class_='mlm header-5 text-unbold').text
    address = pro.find('span', class_='hz-pro-search-result__location-info__text').text
    reviews = pro.find('span', class_='hz-star-rate__review-string').text
    rating = pro.find('span', class_='hz-star-rate__rating-number').text
    w = pro.find('a')['href']
    data.append({'name':name, 'address':address, 'reviews':reviews, 'rating':rating, 'web':w})

for idx, item in enumerate(data):
    r = requests.get(item['web'], headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    for web in soup.find_all('a', attrs={'class':'sc-62xgu6-0 jxCcwv mwxddt-0 bSdLOV hui-link trackMe'}):
        data[idx]['web'] = web['href']

df = pd.DataFrame(data)
df
Output
name address reviews rating web
0 Capital Remodeling Hanover, Maryland 21076, United States 409 Reviews 4.8 https://www.houzz.com/trk/aHR0cDovL3d3dy5jYXBp...
1 SOD Home Group 367 Santana Heights, Unit #3-3021, San Jose, C... 238 Reviews 5.0 https://www.houzz.com/trk/aHR0cHM6Ly9zb2RoZy5j...
2 Innovative Construction Inc. 3040 Amwiler Rd, Suite B, Peachtree Corners, G... 100 Reviews 5.0 https://www.houzz.com/trk/aHR0cHM6Ly9pbm5vdmF0...
3 Baron Construction & Remodeling Co. Saratoga & Los Angeles, California 95070, Unit... 69 Reviews 4.8 https://www.houzz.com/trk/aHR0cDovL3d3dy5iYXJv...
4 Luxe Remodel 329 N. Wetherly Dr., Suite 205, Los Angeles, C... 79 Reviews 4.9 https://www.houzz.com/professionals/general-co...
5 California Home Builders & Remodeling Inc. STUDIO CITY, California 91604, United States 232 Reviews 5.0 https://www.houzz.com/trk/aHR0cDovL3d3dy5teWNh...
6 Sneller Custom Homes and Remodeling, LLC 17018 Seven Pines Dr Ste 100, Spring, Texas 77... 77 Reviews 4.9 https://www.houzz.com/trk/aHR0cDovL3NuZWxsZXJj...
7 123 Remodeling Inc. 5070 N. Kimberly Ave Suite C, Chicago, Illinoi... 83 Reviews 4.7 https://www.houzz.com/trk/aHR0cHM6Ly8xMjNyZW1v...
8 Professional builders & Remodeling, Inc 15335 Morrison St #325, Sherman Oaks, Californ... 203 Reviews 5.0 https://www.houzz.com/trk/aHR0cDovL3d3dy5wcm9m...
9 Rudloff Custom Builders 896 Breezewood Lane, West Chester, Pennsylvani... 111 Reviews 5.0 https://www.houzz.com/trk/aHR0cDovL1J1ZGxvZmZj...
10 LAR Construction & Remodeling 6371 canby ave, Tarzana, California 91335, Uni... 191 Reviews 5.0 https://www.houzz.com/trk/aHR0cDovL3d3dy5sYXJy...
11 Erie Construction Mid West 4271 Monroe St., Toledo, Ohio 43606, United St... 231 Reviews 4.8 https://www.houzz.com/trk/aHR0cDovL3d3dy5lcmll...
12 Regal Construction & Remodeling Inc. 19537 ½ Ventura Blvd., Tarzana, California 913... 96 Reviews 4.8 https://www.houzz.com/trk/aHR0cDovL3JlZ2FscmVu...
13 Mr. & Mrs. Construction & Remodeling 2570 N 1st street, ste 212, San Jose, Californ... 75 Reviews 5.0 https://www.houzz.com/trk/aHR0cDovL3d3dy5NcmFu...
14 Bailey Remodeling and Construction LLC 201 Meridian Ave., Suite 201, Louisville, Kent... 106 Reviews 5.0 https://www.houzz.com/trk/aHR0cDovL3d3dy5iYWls...

Webscraping from a Wikipedia Table?

I am trying to get data from this Wikipedia article containing a table of each National Park along with some details of each park. Adapting the code from a similar tutorial I found, I was able to display the name and state of each park, though the area of the park is not working. I suspect that this is because the name and state are links in the Wikipedia article, though I am not certain. How would I change my code to be able to display the area as well?
import requests
from bs4 import BeautifulSoup

URL = "https://en.wikipedia.org/wiki/List_of_national_parks_of_the_United_States"
res = requests.get(URL).text
soup = BeautifulSoup(res, 'html.parser')

for items in soup.find('table', class_='wikitable').find_all('tr')[1::1]:
    data = items.find_all(['th', 'td'])
    try:
        parkName = data[0].a.text
        parkState = data[2].a.text
        parkArea = data[4].span.text
    except IndexError:
        pass
    print("{} | {} | {}".format(parkName, parkState, parkArea))
Snippet of my Output
To get the text of the area, you can use .get_text() and then str.rsplit() to get only the area in acres:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_national_parks_of_the_United_States"
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

rows = iter(soup.select('.wikitable tr:has(td, th)'))
next(rows)  # skip headers

for tr in rows:
    name, _, state, _, area, *_ = tr.select('td, th')
    name = name.get_text(strip=True)
    state = state.a.get_text(strip=True)
    area = area.get_text(strip=True).rsplit(maxsplit=2)[0]
    print('{:<35}{:<25}{}'.format(name, state, area))
Prints:
Acadia Maine 49,076.63 acres
American Samoa American Samoa 8,256.67 acres
Arches Utah 76,678.98 acres
Badlands South Dakota 242,755.94 acres
Big Bend Texas 801,163.21 acres
Biscayne Florida 172,971.11 acres
Black Canyon of the Gunnison Colorado 30,779.83 acres
Bryce Canyon Utah 35,835.08 acres
Canyonlands Utah 337,597.83 acres
Capitol Reef Utah 241,904.50 acres
Carlsbad Caverns* New Mexico 46,766.45 acres
Channel Islands California 249,561.00 acres
Congaree South Carolina 26,476.47 acres
Crater Lake Oregon 183,224.05 acres
Cuyahoga Valley Ohio 32,571.88 acres
Death Valley California 3,408,406.73 acres
Denali Alaska 4,740,911.16 acres
Dry Tortugas Florida 64,701.22 acres
Everglades Florida 1,508,938.57 acres
Gates of the Arctic Alaska 7,523,897.45 acres
Gateway Arch Missouri 192.83 acres
Glacier Montana 1,013,125.99 acres
Glacier Bay Alaska 3,223,383.43 acres
Grand Canyon* Arizona 1,201,647.03 acres
Grand Teton Wyoming 310,044.36 acres
Great Basin Nevada 77,180.00 acres
Great Sand Dunes Colorado 107,341.87 acres
Great Smoky Mountains North Carolina 522,426.88 acres
Guadalupe Mountains Texas 86,367.10 acres
Haleakalā Hawaii 33,264.62 acres
Hawaiʻi Volcanoes Hawaii 325,605.28 acres
Hot Springs Arkansas 5,554.15 acres
Indiana Dunes Indiana 15,349.08 acres
Isle Royale Michigan 571,790.30 acres
Joshua Tree California 795,155.85 acres
Katmai Alaska 3,674,529.33 acres
Kenai Fjords Alaska 669,650.05 acres
Kings Canyon California 461,901.20 acres
Kobuk Valley Alaska 1,750,716.16 acres
Lake Clark Alaska 2,619,816.49 acres
Lassen Volcanic California 106,589.02 acres
Mammoth Cave Kentucky 54,011.91 acres
Mesa Verde* Colorado 52,485.17 acres
Mount Rainier Washington 236,381.64 acres
North Cascades Washington 504,780.94 acres
Olympic Washington 922,649.41 acres
Petrified Forest Arizona 221,390.21 acres
Pinnacles California 26,685.73 acres
Redwood* California 138,999.37 acres
Rocky Mountain Colorado 265,807.25 acres
Saguaro Arizona 91,715.72 acres
Sequoia California 404,062.63 acres
Shenandoah Virginia 199,223.77 acres
Theodore Roosevelt North Dakota 70,446.89 acres
Virgin Islands U.S. Virgin Islands 15,052.53 acres
Voyageurs Minnesota 218,222.35 acres
White Sands New Mexico 146,344.31 acres
Wind Cave South Dakota 33,970.84 acres
Wrangell–St. Elias* Alaska 8,323,146.48 acres
Yellowstone Wyoming 2,219,790.71 acres
Yosemite* California 761,747.50 acres
Zion Utah 147,242.66 acres
You can change this line:
parkArea = data[4].span.text
to this one if you want area in acres:
parkArea = data[4].text.split(' ')[0]
or to this one if you want it in km2:
parkArea = data[4].text.split(' ')[2]

Split a dataframe column on non-ASCII characters

This is a column with data and non-ASCII characters:
Summary 1
United Kingdom - ��Global Consumer Technology - ��American Express
United Kingdom - ��VP Technology - Founder - ��Hogarth Worldwide
Aberdeen - ��SeniorCore Analysis Specialist - ��COREX Group
London, - ��ED, Equit Technology, London - ��Morgan Stanley
United Kingdom - ��Chief Officer, Group Technology - ��BP
How do I split them and save them in different columns?
The code I used is:
import io
import pandas as pd
df = pd.read_csv("/home/vipul/Desktop/dataminer.csv", sep='\s*\+.*?-\s*')
df = df.reset_index()
df.columns = ["First Name", "Last Name", "Email", "Profile URL", "Summary 1", "Summary 2"]
df.to_csv("/home/vipul/Desktop/new.csv")
Say, you have a column in a series like this:
s
0 United Kingdom - ��Global Consumer Technolog...
1 United Kingdom - ��VP Technology - Founder -...
2 Aberdeen - ��SeniorCore Analysis Specialist ...
3 London, - ��ED, Equit Technology, London - �...
4 United Kingdom - ��Chief Officer, Group Tech...
Name: Summary 1, dtype: object
Option 1
Expanding on this answer, you can split on non-ascii characters using str.split:
s.str.split(r'-\s*[^\x00-\x7f]+', expand=True)
0 1 2
0 United Kingdom Global Consumer Technology American Express
1 United Kingdom VP Technology - Founder Hogarth Worldwide
2 Aberdeen SeniorCore Analysis Specialist COREX Group
3 London, ED, Equit Technology, London Morgan Stanley
4 United Kingdom Chief Officer, Group Technology BP
Option 2
str.extractall + unstack:
s.str.extractall('([\x00-\x7f]+)')[0].str.rstrip(r'- ').unstack()
match 0 1 2
0 United Kingdom Global Consumer Technology American Express
1 United Kingdom VP Technology - Founder Hogarth Worldwide
2 Aberdeen SeniorCore Analysis Specialist COREX Group
3 London, ED, Equit Technology, London Morgan Stanley
4 United Kingdom Chief Officer, Group Technology BP
Another approach:
a
0 United Kingdom - ��Global Consumer Technolog...
1 United Kingdom - ��VP Technology - Founder -...
2 Aberdeen - ��SeniorCore Analysis Specialist ...
3 London, - ��ED, Equit Technology, London - �...
4 United Kingdom - ��Chief Officer, Group Tech...
Use this function to extract the ASCII characters (those whose Unicode code point is below 128) with the ord built-in function:
def extract_ascii(x):
    string_list = filter(lambda y: ord(y) < 128, x)
    return ''.join(string_list)
and apply it to columns.
df1.a.apply(extract_ascii).str.split('-', expand=True)
Here are the results:
0 1 2 3
0 United Kingdom Global Consumer Technology American Express None
1 United Kingdom VP Technology Founder Hogarth Worldwide
2 Aberdeen SeniorCore Analysis Specialist COREX Group None
3 London, ED, Equit Technology, London Morgan Stanley None
4 United Kingdom Chief Officer, Group Technology BP None

ValueError: array must not contain infs or NaNs

I have a CSV file with data that is formatted, for example, as follows (my data set is much, much larger):
Image Id,URL,Latitude,Longitude,Address
10758202333,https://farm8.staticflickr.com/7408/10758202333_b6c29d93b1_q.jpg,51.482826,-0.167112,Cadogan Pier Chelsea Embankment Chelsea Royal Borough of Kensington and Chelsea London
23204019400,https://farm6.staticflickr.com/5688/23204019400_fb6879abe3_q.jpg,51.483106,-3.171207,Greggs Station Terrace Plasnewydd Cardiff Wales CF United Kingdom
11243511074,https://farm3.staticflickr.com/2818/11243511074_e1e2f1b99c_q.jpg,51.483297,-0.166534,Albert Bridge Chelsea Embankment Chelsea Royal Borough of Kensington and Chelsea London Greater London England SW3 5SY United Kingdom
22186903335,https://farm6.staticflickr.com/5697/22186903335_de53168305_q.jpg,51.483394,-3.176926,Greyfriars House Greyfriars Road Plasnewydd Cardiff Wales CF United Kingdom
22197179851,https://farm6.staticflickr.com/5786/22197179851_a818b17fae_q.jpg,51.483394,-3.176926,Greyfriars House Greyfriars Road Plasnewydd Cardiff Wales CF United Kingdom
22174235522,https://farm1.staticflickr.com/589/22174235522_3ffd1de2bb_q.jpg,51.483394,-3.176926,Greyfriars House Greyfriars Road Plasnewydd Cardiff Wales CF United Kingdom
22160755536,https://farm1.staticflickr.com/761/22160755536_8e23e9ed32_q.jpg,51.483394,-3.176926,Greyfriars House Greyfriars Road Plasnewydd Cardiff Wales CF United Kingdom
7667114130,https://farm8.staticflickr.com/7269/7667114130_117849250a_q.jpg,51.484563,-3.178181,Oybike Gorsedd Gardens Road Cathays Cardiff Wales CF United Kingdom
17136775881,https://farm9.staticflickr.com/8780/17136775881_363c2379ef_q.jpg,51.484608,-3.178845,Oybike Gorsedd Gardens Road Cathays Cardiff Wales CF United Kingdom
7110881411,https://farm9.staticflickr.com/8162/7110881411_f0fe3d7214_q.jpg,51.484644,-3.178099,Oybike Gorsedd Gardens Road Cathays Cardiff Wales CF United Kingdom
11718453936,https://farm4.staticflickr.com/3700/11718453936_148af12df6_q.jpg,51.484661,-3.179117,King Edward VII Avenue Cathays Cardiff Wales CF United Kingdom
20218915752,https://farm1.staticflickr.com/352/20218915752_4282c1f9b8_q.jpg,51.484683,-3.179147,King Edward VII Avenue Cathays Cardiff Wales CF United Kingdom
My code is as follows. I know it is not much, but for now I simply want to be able to see a cluster plot figure showing up, with centroids. However, I am getting the error "ValueError: array must not contain infs or NaNs".
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans, kmeans2, whiten
df = pd.read_csv('dataset_import.csv')
df.head()
coordinates = df.as_matrix(columns=['latitude', 'longitude'])
N = len(coordinates)
k = 100
i = 50
w = whiten(coordinates)
cluster_centroids, closest_centroids = kmeans2(w, k, iter=i, minit='points')
plt.figure(figsize=(10, 6), dpi=100)
plt.scatter(cluster_centroids[:,0], cluster_centroids[:,1], c='r', alpha=.7, s=150)
plt.scatter(w[:,0], w[:,1], c='k', alpha=.3, s=10)
plt.show()
Can anyone shed some light on why this is happening? Perhaps some of the figures in my code are wrong, etc. Thanks!
I have met the same problem, and I solved it by wiping out the NaNs and infs.
def clean(serie):
    output = serie[(np.isnan(serie) == False) & (np.isinf(serie) == False)]
    return output
When I draw a plot, I use this function to clean my data in a temporary way, and it works now.
fig = plt.figure()
clean(data[col]).plot(kind='kde')
plt.show()
Or like this:
sns.kdeplot(clean(data[col]), bw=0.1, shade=True, legend=False)
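The same idea can be applied to the clustering code from the question; a minimal sketch that drops non-finite coordinate rows before calling kmeans2 (assuming the column names match the CSV header, i.e. 'Latitude'/'Longitude'):

import numpy as np
import pandas as pd
from scipy.cluster.vq import kmeans2, whiten

df = pd.read_csv('dataset_import.csv')

# keep only rows where both coordinates are finite (no NaN, no inf)
coordinates = df[['Latitude', 'Longitude']].values
coordinates = coordinates[np.isfinite(coordinates).all(axis=1)]

w = whiten(coordinates)
cluster_centroids, closest_centroids = kmeans2(w, 100, iter=50, minit='points')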
