I'm a newbie learning BeautifulSoup. Could someone have a look at the following code? I've been trying to scrape data from a website without any success. I'd like to build a dataframe with the sum of player arrivals per year and a column with the players' average age.
My dataframe keeps repeating the same values for every year (screenshot of the erroneous dataframe omitted).
my code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
anos_list = list(range(2005, 2018))
anos_lista = []
valor_contratos_lista = []
idade_média_lista = []
for ano_lista in anos_list:
    url = 'https://www.transfermarkt.com/flamengo-rio-de-janeiro/transfers/verein/614/saison_id/' + str(anos_list) + ''
    page = requests.get(url, headers={'User-Agent': 'Custom5'})
    soup = BeautifulSoup(page.text, 'html.parser')
    tag_list = soup.tfoot.find_all('td')
    valor = (tag_list[0].string)
    idade = (tag_list[1].string)
    ano = ano_lista
    valor_contratos_lista.append(valor)
    idade_media_lista.append(idade)
    anos_lista.append(ano)

flamengo_df = pd.DataFrame({'Ano': ano_lista,
                            'Despesa com contratações': valor_contratos_lista,
                            'Média de idade': idade_média_lista
                            })
flamengo_df.to_csv('flamengo.csv', encoding='utf-8')
Here's my approach:
Using Beautiful Soup + Regex:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
# Set min and max years as variables
min_year = 2005
max_year = 2019
year_range = list(range(min_year, max_year + 1))
base_url = 'https://www.transfermarkt.com/flamengo-rio-de-janeiro/transfers/verein/614/saison_id/'
# Begin iterating
records = []
for year in year_range:
    url = base_url + str(year)
    # get the page
    page = requests.get(url, headers={'User-Agent': 'Custom5'})
    soup = BeautifulSoup(page.text, 'html.parser')
    # I used the class of "responsive table"
    tables = soup.find_all('div', {'class': 'responsive-table'})
    rows = tables[0].find_all('tr')
    cells = [row.find_all('td', {'class': 'zentriert'}) for row in rows]
    # get variable names:
    variables = [x.text for x in rows[0].find_all('th')]
    variables_values = {x: [] for x in variables}
    # get values
    for row in rows:
        values = [' '.join(x.text.split()) for x in row.find_all('td')]
        values = [x for x in values if x != '']
        if len(variables) < len(values):
            values.pop(4)
            values.pop(2)
        for k, v in zip(variables_values.keys(), values):
            variables_values[k].append(v)
    num_pattern = re.compile('[0-9,]+')
    to_float = lambda x: float(x) if x != '' else np.NAN
    get_nums = lambda x: to_float(''.join(num_pattern.findall(x)).replace(',', '.'))
    # Add values to an individual record
    rec = {
        'Url': url,
        'Year': year,
        'Total Transfers': len(variables_values['Player']),
        'Avg Age': np.mean([int(x) for x in variables_values['Age']]),
        'Avg Cost': np.nanmean([get_nums(x) for x in variables_values['Fee'] if ('loan' not in x)]),
        'Total Cost': np.nansum([get_nums(x) for x in variables_values['Fee'] if ('loan' not in x)]),
    }
    # Store record
    records.append(rec)
Thereafter, initialize the dataframe. Of note, some of the numbers represent millions and would need to be adjusted for that; a sketch of such a conversion follows the output below.
import pandas as pd
# Drop the URL
df = pd.DataFrame(records, columns=['Year','Total Transfers','Avg Age','Avg Cost','Total Cost'])
Year Total Transfers Avg Age Avg Cost Total Cost
0 2005 26 22.038462 2.000000 2.00
1 2006 32 23.906250 240.660000 1203.30
2 2007 37 22.837838 462.750000 1851.00
3 2008 41 22.926829 217.750000 871.00
4 2009 31 23.419355 175.000000 350.00
5 2010 46 23.239130 225.763333 1354.58
6 2011 47 23.042553 340.600000 1703.00
7 2012 45 24.133333 345.820000 1037.46
8 2013 36 24.166667 207.166667 621.50
9 2014 37 24.189189 111.700000 335.10
10 2015 49 23.530612 413.312000 2066.56
11 2016 41 23.341463 241.500000 966.00
12 2017 31 24.000000 101.433333 304.30
13 2018 18 25.388889 123.055000 738.33
14 2019 10 25.300000 NaN 0.00
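As noted above, Transfermarkt shows fees at mixed scales (for example "€2.50m" versus "€500Th."), so the cost columns above mix millions and thousands. Below is a rough sketch of a converter that normalizes fee strings to euros; the suffix handling is an assumption about the site's formatting, and anything unparseable becomes NaN.

import re
import numpy as np

def fee_to_eur(fee: str) -> float:
    """Convert a fee string such as '€2.50m' or '€500Th.' to euros.
    The 'm' (millions) and 'Th.' (thousands) suffixes are assumptions about
    the site's formatting; strings without a number (e.g. '-', '?',
    'Free transfer') become NaN."""
    match = re.search(r'([\d.,]+)\s*(m|Th\.)?', fee)
    if not match or not any(ch.isdigit() for ch in match.group(1)):
        return np.nan
    number = float(match.group(1).replace(',', '.'))
    scale = {'m': 1_000_000, 'Th.': 1_000}.get(match.group(2), 1)
    return number * scale

print(fee_to_eur('€2.50m'))         # 2500000.0
print(fee_to_eur('€500Th.'))        # 500000.0
print(fee_to_eur('Free transfer'))  # nan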
Related
I need to write a script that sums the values from each column (each column is a separate day). In addition, I want to separate the values into planned (blue) and unplanned (red). In the HTML code, I found that the unplanned values have the class name "colBox cal-unplanned" and the planned values have the class name "colBox cal-planned".
My code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
URL = 'http://gpi.tge.pl/zestawienie-ubytkow'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
# Here I tried to convert the data into a dataframe, but then you don't know which values are planned and which are unplanned
table = soup.find_all('table')
df = pd.read_html(str(table),header=2)[0]
# Here the values are correct, but they are collected from the whole table
sum = 0
for tr in soup.find_all('td', class_='colBox cal-unplanned'):
    val = int(tr.text)
    sum += val
print(sum)

for tr in soup.find_all('td', class_='colBox cal-planned'):
    print(tr.text)
And here's my question: how can I select values from each column separately?
Not sure there's a better way, but you can iterate through the table and store the planned and unplanned values in separate lists under the key of the column name, then sum up those values and use that dictionary to convert to a dataframe.
But you're right, you lose that attribute in parsing it with .read_html().
This works, but not sure how robust it is for your situation.
import pandas as pd
import requests
from bs4 import BeautifulSoup
URL = 'http://gpi.tge.pl/zestawienie-ubytkow'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table')
data = {}
headers = [x.text.strip() for x in table.find_all('tr')[2].find_all('th')]
for header in headers:
    data[header] = {'planned': [],
                    'unplanned': []}

rows = table.find_all('tr')[3:]
for row in rows:
    tds = row.find_all('td')[3:len(headers)+3]
    for idx, value in enumerate(tds):
        if value.has_attr("class"):
            if 'cal-planned' in value['class']:
                data[headers[idx]]['planned'].append(int(value.text.strip()))
            elif 'cal-unplanned' in value['class']:
                data[headers[idx]]['unplanned'].append(int(value.text.strip()))

sum_of_columns = {}
for col, values in data.items():
    planned_sum = sum(values['planned'])
    unplanned_sum = sum(values['unplanned'])
    sum_of_columns[col] = {'planned': planned_sum,
                           'unplanned': unplanned_sum}

df = pd.DataFrame.from_dict(sum_of_columns, orient="columns")
Output:
print(df.to_string())
Cz 14 Pt 15 So 16 N 17 Pn 18 Wt 19 Śr 20 Cz 21 Pt 22 So 23 N 24 Pn 25 Wt 26 Śr 27
planned 8808 8301 7750 6863 6069 6199 6069 5627 5627 5695 5695 5235 5235 5376
unplanned 2320 2020 2313 2783 950 950 950 950 950 950 950 910 910 910
So if I understood correctly, you want to work on single columns of your dataframe?
You can use df['column_name'] to access a certain column of the df and then filter that column for the value you want, e.g.
df['column_name'] == filter_value
But then again, I'm not sure I fully get your problem.
This helped me heaps with dataframe value selection.
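A minimal illustration of that column access and boolean filtering, using a small made-up frame that echoes the data above:

import pandas as pd

# Hypothetical frame standing in for the scraped table
df = pd.DataFrame({'day': ['Cz 14', 'Pt 15', 'So 16'],
                   'planned': [8808, 8301, 7750]})

mask = df['planned'] > 8000          # boolean Series, one True/False per row
print(df[mask])                      # only the rows where planned > 8000
print(df.loc[mask, 'day'].tolist())  # just the matching day labels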
Not sure if this is necessarily an issue for bs4, because I think the information is already in the DataFrame as a sum.
How to access?
Take a look at the tail() of your dataframe:
df.tail(3)
Example
import pandas as pd
URL = 'http://gpi.tge.pl/zestawienie-ubytkow'
df = pd.read_html(URL,header=2)[0]
df.tail(3).iloc[:,2:]
Output
Moc Osiągalna (MW) Cz 14 Pt 15 So 16 N 17 Pn 18 Wt 19 Śr 20 Cz 21 Pt 22 So 23 N 24 Pn 25 Wt 26 Śr 27
219 Planowane 11279 10604 8391 6863 6069 6432 6069 5627 5627 5695 5695 5235 5235 5376
220 Nieplanowane 5520 5620 2313 2783 950 950 950 950 950 950 950 910 910 910
221 Łącznie ubytki 16799 16224 10704 9646 7019 7382 7019 6577 6577 6645 6645 6145 6145 6286
I am not sure why it is displaying only the last row in the dataframe instead of all the rows collected in beautified_value.
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = 'https://www.worldometers.info/world-population/population-by-country/'
output = requests.get(url)
soup = BeautifulSoup(output.text, 'html.parser')
table = soup.find_all('table')
table = table[0]
columns = []
header_tags = table.find_all('th')
headers = [header.text.strip() for header in header_tags]
data_rows = table.find_all('tr')
for row in data_rows:
    value = row.find_all('td')
    beautified_value = [dp.text.strip() for dp in value]
    #print(beautified_value)

df = pd.DataFrame(data=[beautified_value], columns=[headers])
You're not appending values to beautified_value, just rewriting it over and over. You can use list.append, for example:
from bs4 import BeautifulSoup
import requests
import pandas as pd
url = "https://www.worldometers.info/world-population/population-by-country/"
output = requests.get(url)
soup = BeautifulSoup(output.text, "html.parser")
table = soup.find("table")
columns = []
header_tags = table.find_all("th")
headers = [header.text.strip() for header in header_tags]
data_rows = table.find_all("tr")[1:]
beautified_value = []

for row in data_rows:
    value = row.find_all("td")
    beautified_value.append([dp.text.strip() for dp in value])

df = pd.DataFrame(data=beautified_value, columns=headers)
print(df)
Prints:
# Country (or dependency) Population (2020) Yearly Change Net Change Density (P/Km²) Land Area (Km²) Migrants (net) Fert. Rate Med. Age Urban Pop % World Share
0 1 China 1,439,323,776 0.39 % 5,540,090 153 9,388,211 -348,399 1.7 38 61 % 18.47 %
1 2 India 1,380,004,385 0.99 % 13,586,631 464 2,973,190 -532,687 2.2 28 35 % 17.70 %
2 3 United States 331,002,651 0.59 % 1,937,734 36 9,147,420 954,806 1.8 38 83 % 4.25 %
...
Use read_html. Note that I had to set the user agent manually with requests, otherwise it would throw a 403 error:
import requests
import pandas as pd

url = 'https://www.worldometers.info/world-population/population-by-country/'
df = pd.read_html(requests.get(url, headers={'User-agent': 'Mozilla/5.0'}).text)[0]
Output:
   #  Country (or dependency)  Population (2020)  Yearly Change  Net Change  Density (P/Km²)  Land Area (Km²)  Migrants (net)  Fert. Rate  Med. Age  Urban Pop %  World Share
0  1  China                           1439323776         0.39 %     5540090              153         9388211         -348399         1.7        38         61 %      18.47 %
1  2  India                           1380004385         0.99 %    13586631              464         2973190         -532687         2.2        28         35 %      17.70 %
2  3  United States                    331002651         0.59 %     1937734               36         9147420          954806         1.8        38         83 %       4.25 %
3  4  Indonesia                        273523615         1.07 %     2898047              151         1811570          -98955         2.3        30         56 %       3.51 %
4  5  Pakistan                         220892340         2.00 %     4327022              287          770880         -233379         3.6        23         35 %       2.83 %
en.wikipedia.org/wiki/List_of_neighbourhoods_of_Istanbul
In the link above, there is un-tabulated data for Istanbul neighbourhoods. I want to fetch these neighbourhoods into a dataframe with this code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
wikiurl="https://en.wikipedia.org/wiki/List_of_neighbourhoods_of_Istanbul"
response=requests.get(wikiurl)
soup = BeautifulSoup(response.text, 'html.parser')
tocList=soup.findAll('a',{'class':"new"})
neighborhoods = []
for item in tocList:
    text = item.get_text()
    neighborhoods.append(text)

df = pd.DataFrame(neighborhoods, columns=['Neighborhood'])
print(df)
and I got this output:
Neighborhood
0 Maden
1 Nizam
2 Anadolu
3 Arnavutköy İmrahor
4 Arnavutköy İslambey
... ...
705 Seyitnizam
706 Sümer
707 Telsiz
708 Veliefendi
709 Yeşiltepe
710 rows × 1 columns
But some data are not fetched; check the list below and compare it with the output:
Adalar
Burgazada
Heybeliada
Kınalıada
Maden
Nizam
findAll() is not fetching the neighbourhoods that appear as plain list items or ordinary links rather than elements with the "new" class, e.g.:
<ol><li>Burgazada</li>
<li>Heybeliada</li>
Also, can I extend the code to produce two columns, 'Neighborhood' and its 'District'?
Are you trying to fetch this list from the Table of Contents?
Please check if this solves your problem:
import pandas as pd
import requests
from bs4 import BeautifulSoup
wikiurl="https://en.wikipedia.org/wiki/List_of_neighbourhoods_of_Istanbul"
response=requests.get(wikiurl)
soup = BeautifulSoup(response.text, 'html.parser')
tocList=soup.findAll('span',{'class':"toctext"})
districts = []
blocked_words = ['Neighbourhoods by districts', 'Further reading', 'External links']
for item in tocList:
    text = item.get_text()
    if text not in blocked_words:
        districts.append(text)

df = pd.DataFrame(districts, columns=['districts'])
print(df)
Output:
districts
0 Adalar
1 Arnavutköy
2 Ataşehir
3 Avcılar
4 Bağcılar
5 Bahçelievler
6 Bakırköy
7 Başakşehir
8 Bayrampaşa
9 Beşiktaş
10 Beykoz
11 Beylikdüzü
12 Beyoğlu
13 Büyükçekmece
14 Çatalca
15 Çekmeköy
16 Esenler
17 Esenyurt
18 Eyüp
19 Fatih
20 Gaziosmanpaşa
21 Güngören
22 Kadıköy
23 Kağıthane
24 Kartal
25 Küçükçekmece
26 Maltepe
27 Pendik
28 Sancaktepe
29 Sarıyer
30 Silivri
31 Sultanbeyli
32 Sultangazi
33 Şile
34 Şişli
35 Tuzla
36 Ümraniye
37 Üsküdar
38 Zeytinburnu
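To also get the 'District' column asked about above, one option is to walk the article body rather than the TOC. A rough sketch, assuming each district is a heading followed by lists of its neighbourhoods; the exact tags on the live page may differ, so adjust the selectors as needed.

import pandas as pd
import requests
from bs4 import BeautifulSoup

wikiurl = "https://en.wikipedia.org/wiki/List_of_neighbourhoods_of_Istanbul"
soup = BeautifulSoup(requests.get(wikiurl).text, 'html.parser')

rows = []
# Assumption: each district name is an <h2>/<h3> heading in the article body,
# followed by <ul>/<ol> lists of its neighbourhoods until the next heading.
content = soup.find('div', {'class': 'mw-parser-output'}) or soup
for heading in content.find_all(['h2', 'h3']):
    district = heading.get_text(strip=True).replace('[edit]', '')
    for sibling in heading.find_next_siblings():
        if sibling.name in ('h2', 'h3'):
            break  # reached the next district heading
        if sibling.name in ('ul', 'ol'):
            for li in sibling.find_all('li'):
                rows.append({'District': district,
                             'Neighborhood': li.get_text(strip=True)})

df = pd.DataFrame(rows)
print(df.head())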
It took me the whole day trying to fix this problem, but I didn't find a solution, so I hope you can help me. I already managed to extract the data from the website, but the problem is that I don't know how to split the list so that 500g becomes 500,g. The quantity on the website is sometimes just 1 and sometimes something like 1 1/2 kg. I then need to convert the result into a CSV file and load it into a MySQL database. What I want at the end is a CSV file with the columns: ingredient ID, ingredient, quantity, and the unit of the quantity, for example:
0, meat, 500, g. This is the code I already have to extract the data from this website:
import re
from bs4 import BeautifulSoup
import requests
import csv
urls_recipes = ['https://www.chefkoch.de/rezepte/3039711456645264/Ossobuco-a-la-Milanese.html']
mainurl = "https://www.chefkoch.de/rs/s0e1n1z1b0i1d1,2,3/Rezepte.html"
urls_urls = []
urls_recipes = ['https://www.chefkoch.de/rezepte/3039711456645264/Ossobuco-a-la-Milanese.html']
ingredients = []
menge = []
def read_recipes():
    for url, id2 in zip(urls_recipes, range(len(urls_recipes))):
        soup2 = BeautifulSoup(requests.get(url).content, "lxml")
        for ingredient in soup2.select('.td-left'):
            menge.append([*[re.sub(r'\s{2,}', ' ', ingredient.get_text(strip=True))]])
        for ingredient in soup2.select('.recipe-ingredients h3, .td-right'):
            if ingredient.name == 'h3':
                ingredients.append([id2, *[ingredient.get_text(strip=True)]])
            else:
                ingredients.append([id2, *[re.sub(r'\s{2,}', ' ', ingredient.get_text(strip=True))]])

read_recipes()
I hope you can help me. Thank you!
It appears that the strings containing fractions use the unicode symbols for 1/2 etc., so I think a good way to start is replacing those by looking up the specific code point and passing it to str.replace(). Splitting up the unit and the amount for this example was easy, since they are separated by a space. But it might be necessary to generalize this more if you encounter other combinations.
The following code works for this specific example:
import re
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
urls_recipes = ['https://www.chefkoch.de/rezepte/3039711456645264/Ossobuco-a-la-Milanese.html']
mainurl = "https://www.chefkoch.de/rs/s0e1n1z1b0i1d1,2,3/Rezepte.html"
urls_urls = []
urls_recipes = ['https://www.chefkoch.de/rezepte/3039711456645264/Ossobuco-a-la-Milanese.html']
ingredients = []
menge = []
einheit = []
for url, id2 in zip(urls_recipes, range(len(urls_recipes))):
    soup2 = BeautifulSoup(requests.get(url).content)
    for ingredient in soup2.select('.td-left'):
        # get rid of multiple spaces and replace 1/2 unicode character
        raw_string = re.sub(r'\s{2,}', ' ', ingredient.get_text(strip=True)).replace(u'\u00BD', "0.5")
        # split into unit and number
        splitlist = raw_string.split(" ")
        menge.append(splitlist[0])
        if len(splitlist) == 2:
            einheit.append(splitlist[1])
        else:
            einheit.append('')
    for ingredient in soup2.select('.recipe-ingredients h3, .td-right'):
        if ingredient.name == 'h3':
            continue
        else:
            ingredients.append([id2, re.sub(r'\s{2,}', ' ', ingredient.get_text(strip=True))])

result = pd.DataFrame(ingredients, columns=["ID", "Ingredients"])
result.loc[:, "unit"] = einheit
result.loc[:, "amount"] = menge
Output:
>>> result
ID Ingredients unit amount
0 0 Beinscheibe(n), vom Rind, ca. 4 cm dick geschn... 4
1 0 Mehl etwas
2 0 Zwiebel(n) 1
3 0 Knoblauchzehe(n) 2
4 0 Karotte(n) 1
5 0 Lauchstange(n) 1
6 0 Staudensellerie 0.5
7 0 Tomate(n), geschält Dose 1
8 0 Tomatenmark EL 1
9 0 Rotwein zum Ablöschen
10 0 Rinderfond oder Fleischbrühe Liter 0.5
11 0 Olivenöl zum Braten
12 0 Gewürznelke(n) 2
13 0 Pimentkörner 10
14 0 Wacholderbeere(n) 5
15 0 Pfefferkörner
16 0 Salz
17 0 Pfeffer, schwarz, aus der Mühle
18 0 Thymian
19 0 Rosmarin
20 0 Zitrone(n), unbehandelt 1
21 0 Knoblauchzehe(n) 2
22 0 Blattpetersilie Bund 1
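If other fraction symbols turn up, one way to generalize the replacement is to let unicodedata.numeric() resolve vulgar-fraction characters. A small sketch, assuming each space-separated token is either a plain number or a single fraction character:

import unicodedata

def parse_amount(raw: str) -> float:
    """Convert amount strings such as '1 ½' or '¾' to a float.
    Assumes each space-separated token is either a plain number or a
    single unicode fraction character such as ½, ⅓ or ¾."""
    total = 0.0
    for token in raw.split():
        try:
            total += float(token)
        except ValueError:
            # unicodedata.numeric('½') == 0.5, '¾' == 0.75, etc.
            total += unicodedata.numeric(token)
    return total

print(parse_amount('1 ½'))  # 1.5
print(parse_amount('½'))    # 0.5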
I am trying to get the data that is present inside the div with class "credit-list linelist" from this site:
https://www.usgbc.org/rpc/LEED%20V4%20BD+C:%20HOMR/v4/1593?location=Littlefield,%20Arizona&lat=36.9161976&lng=-113.95254890000001
Updated code:
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import csv
import re
import json
import sys
import psycopg2
from pyvirtualdisplay import Display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd
def resultSetGetter(databasename, ip, username, pw):
    States = ['Nevada']
    StateList = ','.join('?' for i in range(len(States)))  # '?,?'
    try:
        con = psycopg2.connect(database=databasename, user=username, password=pw, host=ip, port=5432)
        cur = con.cursor()
        # newPrimaryCity = '\'' + primary_city + '\''
        # queryString = "select DISTINCT primary_city from loc_zip_code_details where primary_city = " + newPrimaryCity + ";"
        # print queryString
        queryString = "select distinct(loc_zip_code_details.primary_city),state_name from loc_zip_code_details,loc_state where loc_zip_code_details.state_code = loc_state.state_code and loc_state.state_name IN (%s) ORDER BY state_name,loc_zip_code_details.primary_city limit 1"
        in_p = ', '.join(list(map(lambda x: '%s', States)))
        sql = queryString % in_p
        cur.execute(sql, States)
        # print cur.execute(queryString)
        # print queryString
        zipCodes = cur.fetchall()
        print zipCodes
        return zipCodes
        # zipCodes= [x for x in foo if x!= ("Alba", "Texas")]
        con.close()
        # print zipCodes
    except psycopg2.Error as leed_Error:
        print leed_Error

def flatten(x):
    result = []
    for el in x:
        if hasattr(el, "__iter__") and not isinstance(el, basestring):
            result.extend(flatten(el))
        else:
            result.append(el)
    return result

def leed_data(zipCodes):
    for i in zipCodes:
        driver = webdriver.Chrome(chrome_path)
        time.sleep(3)
        driver.get("http://www.usgbc.org/rpc")
        driver.find_element_by_xpath('//*[@id="mainCol"]/div[1]/div[1]').click()
        time.sleep(3)
        driver.find_element_by_xpath('//*[@id="mainCol"]/div[1]/div[1]/ul/li[10]').click()
        time.sleep(3)
        driver.find_element_by_xpath('//*[@id="mainCol"]/div[1]/div[2]').click()
        time.sleep(1)
        driver.find_element_by_xpath('//*[@id="mainCol"]/div[1]/div[2]/ul/li[2]').click()
        time.sleep(1)
        driver.find_element_by_xpath('//*[@id="edit-address"]').clear()
        # print i
        driver.find_element_by_xpath('//*[@id="edit-address"]').send_keys(i)
        time.sleep(3)
        driver.find_element_by_xpath('//*[@id="geocode"]/div/div[1]/div[2]').click()
        time.sleep(3)
        driver.find_element_by_xpath('//*[@id="search-text"]').click()
        time.sleep(3)
        # alpha = driver.find_elements_by_class_name('views-field-field-category-logo-fid')
        # temp = driver.find_element_by_xpath('//div[@class="credit-list linelist"]').extract()
        # print alpha.text()
        html_list = driver.find_element_by_xpath('//*[@id="mainCol"]/div[5]/ul')
        items = html_list.find_elements_by_tag_name("li")
        a, b, c, d, e, a1, b1, c1, d1, newList, e1 = [], [], [], [], [], [], [], [], [], [], []
        for item in items:
            txt = item.text.split('\n')
            txt.append(i)
            # print txt
            txt[3] = txt[3].split(',')
            newList = flatten(txt)
            print newList
            # txt = flatten(txt)
            # print txt
            # a1=re.search(pattern, txt[2]).group(0)
            # b1=re.search(pattern, txt[1]).group(0)
            newList[2] = int(''.join(w for w in txt[2] if w.isdigit()))
            newList[1] = int(''.join(w for w in txt[1] if w.isdigit()))
            a1 = newList[2]
            b1 = newList[1]
            c1 = newList[0]
            d1 = newList[3]
            e1 = newList[4]
            # d1=d1.split(',')
            # print d1
            # txt.append(d1)
            # newList = newList.append(txt)
            print newList
            a.append(a1)
            b.append(b1)
            c.append(c1)
            d.append(d1)
            e.append(e1)
        # print i
        df = pd.DataFrame({'col1': a, 'col2': e, 'col5': d, 'col3': b, 'col4': c})
        print df
        driver.close()
        # appendCsv = pd.DataFrame()
        # appendCsv.append(df,ignore_index=True)
        # new_df = pd.concat(appendCsv,ignore_index=True)
        df.to_csv('LEED_NEVADA1.csv', index=False, header=False)
    # return df

if __name__ == '__main__':
    databasename = ""
    ip = ""
    username = ""
    pw = ""
    # print databasename,ip,username,pw
    zipCodes = resultSetGetter(databasename, ip, username, pw)
    zipCodes = [','.join(map(lambda x: x.replace(' ', ''), y)) for y in zipCodes]
    # display = Display(visible=0, size=(800, 600))
    # display.start()
    chrome_options = Options()
    chrome_options.add_experimental_option('prefs', {
        'credentials_enable_service': False,
        'profile': {
            'password_manager_enabled': False
        }
    })
    chrome_path = r"/usr/bin/chromedriver"
    finalList = leed_data(zipCodes)
    # print finalList
    # df = pd.DataFrame(finalList)
    # finalList.to_csv('Leed1.csv', index=False, header=False)
My input file will be a CSV from the database:
"LittleField,Arizona"
"Gilbert,Ariznona"
I got output like this:
0 20 Nevada 30 Annual energy use Alamo
1 3 Nevada 5 Efficient hot water distribution system Alamo
2 1 Nevada 1 No environmental tobacco smoke Alamo
3 2 Nevada 3 Compact development Alamo
4 2 Nevada 3 Construction waste management Alamo
5 7 Nevada 12 Total water use Alamo
6 4 Nevada 6 Indoor water use Alamo
0 20 Nevada 30 Annual energy use AmargosaValley
1 1 Nevada 1 No environmental tobacco smoke AmargosaValley
2 2 Nevada 3 Compact development AmargosaValley
3 1 Nevada 2 Community resources AmargosaValley
4 2 Nevada 3 Construction waste management AmargosaValley
5 7 Nevada 12 Total water use AmargosaValley
6 4 Nevada 6 Indoor water use AmargosaValley
I want to append that output into a csv.
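For the appending part, one minimal pattern is to open the CSV in append mode and write the header only the first time; a sketch reusing the file name from the code above:

import os
import pandas as pd

def append_to_csv(df: pd.DataFrame, path: str = 'LEED_NEVADA1.csv') -> None:
    """Append df to path, writing the header only if the file does not exist yet."""
    write_header = not os.path.exists(path)
    df.to_csv(path, mode='a', index=False, header=write_header)

# inside the per-city loop, instead of df.to_csv('LEED_NEVADA1.csv', ...):
# append_to_csv(df)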
You can either use regex + pandas dataframe or string-digit-check + pandas dataframe:
%%timeit
html_list = driver.find_element_by_xpath('//*[@id="mainCol"]/div[5]/ul')
items = html_list.find_elements_by_tag_name("li")
a, b, c, a1, b1, c1 = [], [], [], [], [], []
# pattern = re.compile(r'\d+')
for item in items:
    txt = item.text.split('\n')
    # a1 = re.search(pattern, txt[2]).group(0)
    # b1 = re.search(pattern, txt[1]).group(0)
    a1 = int(''.join(w for w in txt[2] if w.isdigit()))
    b1 = int(''.join(w for w in txt[1] if w.isdigit()))
    c1 = txt[0]
    a.append(a1)
    b.append(b1)
    c.append(c1)
df = pd.DataFrame({'col1': a, 'col2': 'Littlefield Arizona', 'col3': b, 'col4': c})
col1 col2 col3 col4
0 10 Littlefield Arizona 30 Annual energy use
1 2 Littlefield Arizona 3 Compact development
2 1 Littlefield Arizona 2 Access to transit
3 1 Littlefield Arizona 2 Heat island reduction
4 2 Littlefield Arizona 3 Rainwater management
5 6 Littlefield Arizona 12 Total water use
6 4 Littlefield Arizona 6 Indoor water use
114 ms ± 3.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
I've left an alternative commented out: precompiling a regex and searching with it.
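For reference, here is that commented-out regex alternative as a standalone sketch: precompile the pattern once and pull the first run of digits out of each line.

import re

pattern = re.compile(r'\d+')

def first_number(text):
    """Return the first run of digits in text as an int, or None if there is none."""
    match = pattern.search(text)
    return int(match.group(0)) if match else None

print(first_number('Annual energy use 20/30'))  # 20
print(first_number('No digits here'))           # None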