A year ago I learned some python in one of my classes but haven't had to use much since then so this may or may not be a simple question.
I'm trying to web-scrape the top grossing films of all time table from Box Office Mojo and I want to grab the rank, title, and gross for the top 10 films in the 2010s. I've been playing around in python and I can get the entire table into python but I don't know how to manipulate it from there, let alone write out a csv file. Any guidance/tips?
Here is what will print the entire table for me (the first few lines are copied from an old web-scraping assignment to get me started):
import bs4
import requests
from bs4 import BeautifulSoup as soup
url = "https://www.boxofficemojo.com/chart/top_lifetime_gross/"
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
page_html = requests.get(url, headers=headers)
page_soup = soup(page_html.text, "html.parser")
boxofficemojo_table = page_soup.find("div", {"class": "a-section imdb-scroll-table-inner"})
complete_table = boxofficemojo_table.get_text()
print(complete_table)`
You Can use pd.read_html for this.
import pandas as pd
Data = pd.read_html(r'https://www.boxofficemojo.com/chart/top_lifetime_gross/')
for data in Data:
data.to_csv('Data.csv', ',')
2.Using Bs4
import pandas as pd
from bs4 import BeautifulSoup
import requests
URL = r'https://www.boxofficemojo.com/chart/top_lifetime_gross/'
print('\n>> Exctracting Data using Beautiful Soup for :'+ URL)
try:
res = requests.get(URL)
except Exception as e:
print(repr(e))
print('\n<> URL present status Code = ',(res.status_code))
soup = BeautifulSoup(res.text,"lxml")
table = soup.find('table')
list_of_rows = []
for row in table.findAll('tr'):
list_of_cells = []
for cell in row.findAll(["td"]):
text = cell.text
list_of_cells.append(text)
list_of_rows.append(list_of_cells)
for item in list_of_rows:
' '.join(item)
Data = pd.DataFrame(list_of_rows)
Data.dropna(axis = 0, how = 'all',inplace = True)
print(Data.head(10))
Data.to_csv('Table.csv')
Related
I'm relatively new to web scrapping , I used Selenium and beautiful soup to srcape data however I'm unable to, Can someone help get the table data from the following link or any way to download the CSV file in Python please?
'''
print("Start")
from nsetools import Nse
import pandas as pd
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import urllib.request
nse_web = "https://www.nseindia.com/market-data/new-52-week-high-low-equity-market"
req = urllib.request.Request(
nse_web,
data=None,
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'}
)
#f = urllib.request.urlopen(req)
#nse_web = "https://www.nseindia.com/market-data/new-52-week-high-low-equity-market"
time.sleep(5)
html = urlopen(req)
print("open URL")
time.sleep(10)
bsObj = BeautifulSoup(html, features="lxml")
print("before_table")
time.sleep(15)
data = []
table = soup.find('table', attrs={'class':'common_table customHeight-table tableScroll alt_row w-100'})
print(table)
table_body = table.find('tbody')
print(table_body)
rows = table_body.find_all('tr')
Print(rows)
for row in rows:
cols = row.find_all('td')
cols = [ele.text.strip() for ele in cols]
data.append([ele for ele in cols if ele]) # Get rid of empty values
print(data)
print("process complete")
'''
It seems that you are able to get the data from the table as a list of rows. To make a csv file you need to use python csv module.
import csv
with open('result.csv', 'wb') as csv_file:
csv_obj = csv.writer(csv_file)
for row in data:
csv_obj.writerow(row)
you can obtain your results.csv file in your current directory.
I am trying to scrape IMDB https://www.imdb.com/chart/top/?ref_=nv_mv_250. I want to write a loop to enter each film page by getting all the href attributes. However, the html code returned by urlopen shows broken href attributes (ignoring everythin after the question mark). Here are my code and result. Thank you so much in advance.
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'
html = urlopen(url)
bs = BeautifulSoup(html.read(),'html.parser')
table = bs.find('tbody',{'class':'lister-list'})
rows = table.find_all('tr')
for row in rows:
link = row.find('td',{'class':'titleColumn'}).find('a')['href']
print(link)
The result I get is something like this (ignoring everythin after the question mark)
/usr/local/Caskroom/miniconda/base/envs/web_scraping/bin/python /Users/gracezhou/Python/Python-Projects/scraping/imdb/test.py
/title/tt0111161/
/title/tt0068646/
/title/tt0071562/
/title/tt0468569/
/title/tt0050083/
/title/tt0108052/
/title/tt0167260/
/title/tt0110912/
/title/tt0060196/
/title/tt0120737/
/title/tt0137523/
/title/tt0109830/
I wan to receive somethins like this:
/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=CPK54FS6SPX9EDAPBSJT&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1
Send the request using headers with User-Agent in it.
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}
res = requests.get("https://www.imdb.com/chart/top/?ref_=nv_mv_250", headers=headers)
soup = BeautifulSoup(res.content,"html.parser")
table = soup.find('tbody',{'class':'lister-list'})
rows = table.find_all('tr')
for row in rows:
link = row.find('td',{'class':'titleColumn'}).find('a')['href']
print(link)
Output:
/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1
/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_2
/title/tt0071562/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_3
/title/tt0468569/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_4
/title/tt0050083/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_5
/title/tt0108052/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_6
/title/tt0167260/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_7
/title/tt0110912/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_8
/title/tt0060196/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_9
/title/tt0120737/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_10
/title/tt0137523/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_11
/title/tt0109830/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_12
/title/tt1375666/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_13
/title/tt0080684/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_14
/title/tt0167261/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_15
/title/tt0133093/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_16
/title/tt0099685/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_17
/title/tt0073486/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_18
/title/tt0047478/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_19
/title/tt0114369/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_20
/title/tt0118799/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_21
/title/tt0317248/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_22
/title/tt0102926/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_23
/title/tt8503618/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_24
/title/tt0038650/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_25
/title/tt0076759/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_26
/title/tt0120815/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_27
/title/tt0245429/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_28
/title/tt6751668/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_29
/title/tt0120689/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_30
/title/tt0816692/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_31
/title/tt0110413/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_32
/title/tt0114814/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_33
/title/tt0056058/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_34
/title/tt0110357/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_35
/title/tt0088763/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_36
/title/tt0253474/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_37
/title/tt0103064/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_38
/title/tt0120586/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_39
/title/tt0027977/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_40
/title/tt0054215/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_41
/title/tt0172495/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_42
/title/tt0021749/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_43
/title/tt0407887/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_44
/title/tt1675434/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_45
/title/tt2582802/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_46
/title/tt0482571/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_47
/title/tt0095327/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_48
/title/tt0064116/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_49
/title/tt0034583/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_50
/title/tt0095765/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_51
/title/tt0047396/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_52
/title/tt0078748/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_53
/title/tt0078788/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_54
/title/tt0209144/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_55
/title/tt0082971/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_56
/title/tt0032553/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_57
/title/tt7286456/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_58
/title/tt0405094/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_59
/title/tt1853728/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_60
/title/tt0050825/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_61
/title/tt0081505/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_62
/title/tt0910970/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_63
/title/tt4154756/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_64
/title/tt0043014/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_65
/title/tt4633694/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_66
/title/tt0119698/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_67
/title/tt0057012/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_68
/title/tt0051201/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_69
/title/tt0364569/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_70
/title/tt1345836/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_71
/title/tt0087843/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_72
/title/tt4154796/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_73
/title/tt0090605/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_74
/title/tt5311514/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_75
/title/tt2380307/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_76
/title/tt0169547/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_77
/title/tt0112573/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_78
/title/tt1187043/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_79
/title/tt0082096/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_80
/title/tt0114709/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_81
/title/tt0057565/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_82
/title/tt0086879/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_83
/title/tt0986264/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_84
/title/tt0086190/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_85
/title/tt0361748/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_86
/title/tt0105236/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_87
/title/tt0119217/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_88
/title/tt8267604/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_89
/title/tt0062622/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_90
/title/tt0180093/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_91
/title/tt8579674/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_92
/title/tt0052357/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_93
/title/tt0022100/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_94
/title/tt5074352/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_95
/title/tt0338013/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_96
/title/tt2106476/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_97
/title/tt0033467/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_98
/title/tt0093058/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_99
/title/tt0040522/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_100
/title/tt0066921/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_101
/title/tt0053125/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_102
/title/tt0012349/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_103
/title/tt0208092/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_104
/title/tt0045152/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_105
/title/tt0086250/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_106
/title/tt0075314/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_107
/title/tt0211915/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_108
/title/tt0056172/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_109
...
...
...
New to screen scraping here and this is my first time posting on stackoverflow. Aplogies in advance for any formatting errors in this post. Attempting to extract data from multiple pages with URL:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
For instance, page 1 is:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-1
Page 2:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-2
and so on...
My script is running without errors. However, my Pandas exported csv only contains 1 row with the first extracted value. At the time of this posting, the first value is:
14.01 Acres  Vestaburg, Montcalm County, MI$275,000
My intent is to create a spreadsheet with hundreds of rows that pull the property description from the URLs.
Here is my code:
import requests
from requests import get
from bs4 import BeautifulSoup
headers = ({'User-Agent':
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
}
)
n_pages = 0
desc = []
for page in range(1,900):
n_pages += 1
sapo_url = 'https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
r=get(sapo_url, headers=headers)
page_html = BeautifulSoup(r.text, 'html.parser')
house_containers = page_html.find_all('div', class_="propName")
if house_containers != []:
for container in house_containers:
desc = container.getText(strip=True)
else:
break
print('you scraped {} pages containing {} Properties'.format(n_pages, len(desc)))
import pandas as pd
df = pd.DataFrame({'description': [desc]})
df.to_csv('test4.csv', encoding = 'utf-8')
I suspect the problem is with the line reading desc = container.getText(strip=True) and have tried changing the line but keep getting errors when running.
Any help is appreciated.
I believe the mistake is in the line:
desc = container.getText(strip=True)
Every time it loops, the value in desc is replaced, not added on. To add items into the list, do:
desc.append(container.getText(strip=True))
Also, since it is already a list, you can remove the brackets from the DataFrame creation like so:
df = pd.DataFrame({'description': desc})
The cause is that no data is being added in the loop, so only the final data is being saved. For testing purposes, this code is now on page 2, so please fix it.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
headers = ({'User-Agent':
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
}
)
n_pages = 0
desc = []
all_data = pd.DataFrame(index=[], columns=['description'])
for page in range(1,3):
n_pages += 1
sapo_url = 'https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
r=get(sapo_url, headers=headers)
page_html = BeautifulSoup(r.text, 'html.parser')
house_containers = page_html.find_all('div', class_="propName")
if house_containers != []:
for container in house_containers:
desc = container.getText(strip=True)
df = pd.DataFrame({'description': [desc]})
all_data = pd.concat([all_data, df], ignore_index=True)
else:
break
all_data.to_csv('test4.csv', encoding = 'utf-8')
print('you scraped {} pages containing {} Properties'.format(n_pages, len(desc)))
I am working with lobbying data from opensecrets.org, in particular industry data. I want to have a time series of lobby expenditures for each industry going back since the 90's.
I want to web-scrape the data automatically. Urls where the data is have the following format:
https://www.opensecrets.org/lobby/indusclient.php?id=H04&year=2019
which are pretty easy to embed in a loop, the problem is that the data I need is not in an easy format in the webpage. It is inside a bar graph, and when I inspect the graph I do not know how to get the data since it is not in the html code. I am familiar with web-scraping in python when the data is in the html code, but in this case I am not sure how to proceed.
If there is an API, that your best bet as mentioned above. But the data is able to be parsed anyway provided you get the right url/query parameters:
I've managed to iterate through it with the links for you to grab each table. I stored it in a dictionary with the key being the Firm name, and the value being the table/data. You can change it up to anyway you'd like. Maybe just store as json, or save each as csv.
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://www.opensecrets.org/lobby/indusclient.php?id=H04&year=2019'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
data = requests.get(url, headers=headers)
soup = BeautifulSoup(data.text, 'html.parser')
links = soup.find_all('a', href=True)
root_url = 'https://www.opensecrets.org/lobby/include/IMG_client_year_comp.php?'
links_dict = {}
for each in links:
if 'clientsum.php?' in each['href']:
w=1
firms = each.text
link = root_url + each['href'].split('?')[-1].split('&')[0].strip() + '&type=c'
links_dict[firms] = link
all_tables = {}
n=1
tot = len(links_dict)
for firms, link in links_dict.items():
print ('%s of %s ---- %s' %(n, tot, firms))
data = requests.get(link)
soup = BeautifulSoup(data.text, 'html.parser')
results = pd.DataFrame()
graph = soup.find_all('set')
for each in graph:
year = each['label']
total = each['value']
temp_df = pd.DataFrame([[year, total]], columns=['year','$mil'])
results = results.append(temp_df,sort=True).reset_index(drop=True)
all_tables[firms] = results
n+=1
*Output:**
Not going to print as there are 347 tables, but just so you see the structure:
A few months into python, and am having trouble scraping some information from tables using BeautifulSoup, any help would be appreciated. I am not getting any error codes, but instead just receiving no data from the table.
import bs4 as bs
import requests
resp = requests.get('https://www.thestreet.com/markets/gainers.html')
soup = bs.BeautifulSoup(resp.text, "lxml")
table = soup.find('table', {'id': 'nyseData'})
tickers = []
for row in table.findAll('tr')[1:]:
ticker = row.findAll('td')[1].text
tickers.append(ticker)
Any help is much appreciated!
You are running in to a problem with the page not allowing certain user-agents from accessing their site. This can be fixed by setting a user-agent string in your requests header.
Your code with the user-agent added:
import bs4 as bs
import requests
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
resp = requests.get('https://www.thestreet.com/markets/gainers.html', headers=headers)
soup = bs.BeautifulSoup(resp.text,'lxml')
table = soup.find('table', {'id': 'nyseData'})
tickers = []
for row in table.findAll('tr')[1:]:
ticker = row.findAll('td')[1].text
tickers.append(ticker)
print tickers
Output:
[u'QUOT', u'BCEI', u'ATEN', u'SKX', u'FBK', u'FBM', u'CGI', u'SDRL', u'ELLI', u'CELP', u'SXCP', u'CUB', u'GLF', u'SID', u'HBM', u'NE', u'CBG', u'PJT', u'VVI', u'ARL']