Looking for alternative to Selenium for scraping multiple pages - python
I get the desired results, but I think some of the code could be improved. It's currently quite slow and error-prone when scraping multiple pages in a row. The code below scrapes 5 features for 42 vehicles (21 per page). I'm scraping a total of 18 features (the other 13 are not shown here) for these two pages, but it takes too long considering I want to scrape 29 pages in total.
In order to see the vehicle price you need to log in, which is why I'm using Selenium, as shown in the code below.
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
from selenium import webdriver
import time
from IPython.core.interactiveshell import InteractiveShell #optional
# Change cell settings (optional)
InteractiveShell.ast_node_interactivity = "all"
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 100
pd.options.display.max_columns = None
driver = webdriver.Chrome()
#driver.maximize_window() #optional
# Log in and search
urls = ["https://www.example.com/"]
for url in urls:
    driver.get(url)
    time.sleep(1)
    driver.find_elements_by_class_name("dropdown-toggle")[0].click()
    time.sleep(1)
    driver.find_elements_by_name('email')[0].send_keys("arjenvgeffen@hotmail.com")
    time.sleep(1)
    driver.find_elements_by_name("submit")[0].click()
    time.sleep(2)
    link = driver.find_element_by_link_text('SEARCH')
    time.sleep(1)
    link.click()
    time.sleep(2)
    driver.find_elements_by_name("searchScope")[0].send_keys('ALL PAST')
    time.sleep(1)
    driver.find_elements_by_name("searchMake")[0].send_keys('PLYMOUTH')
    time.sleep(1)
    driver.find_elements_by_name('searchModel')[0].send_keys('Cuda')
    time.sleep(1)
    driver.find_elements_by_name('searchYearStart')[0].send_keys("1970")
    time.sleep(1)
    driver.find_elements_by_name('searchYearEnd')[0].send_keys("1971")
    time.sleep(1)
    driver.find_element_by_xpath("//button[. = 'Search']").click()
    time.sleep(1)
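As an aside, I suspect the fixed time.sleep calls are a big part of why the login flow is slow and flaky. Replacing them with Selenium's explicit waits might help; below is a rough, untested sketch that reuses the element locators from the code above (I'd still prefer to drop Selenium entirely, though):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Untested sketch: wait for each element to be ready instead of sleeping a fixed time
wait = WebDriverWait(driver, 15)
wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "dropdown-toggle"))).click()
wait.until(EC.presence_of_element_located((By.NAME, "email"))).send_keys("arjenvgeffen@hotmail.com")
wait.until(EC.element_to_be_clickable((By.NAME, "submit"))).click()
wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "SEARCH"))).click()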
The code below scrapes the vehicle title (year_make_model_type), the price (which you can only see after logging in with the email above) and the page URLs. The page_urls will be used in the next step to scrape information per product page. This takes too long when scraping 29 pages, and it tends to skip pages or get stuck. Any improvement here is much appreciated!
# Scrape two pages (these two variables can be scraped without being on the vehicle page)
i = 0
x = 1
year_make_model_type = []
price = []

while True:
    for i in range(0, 1):
        time.sleep(2)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        time.sleep(2)
        urls = [x.get('href') for x in soup.findAll("a", class_="lot-title")]
        time.sleep(2)
        mystring = 'https://www.example.com'
        page_urls = [mystring + s for s in urls]
        time.sleep(2)
        for y in soup.find_all("a", class_=("lot-title")):
            year_make_model_type.append(y.text)
        time.sleep(2)
        for p in soup.find_all("span", class_=("lot-price")):
            price.append(re.sub("[\$\,]", "", p.text))
        time.sleep(2)
        i += 1
    for x in range(2, 3):
        time.sleep(5)
        driver.find_element_by_xpath('//a[@href="/search/page/%d/"]' % (x,)).click()
        time.sleep(5)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        time.sleep(2)
        page_products_urls = [x.get('href') for x in soup.findAll("a", class_="lot-title")]
        time.sleep(2)
        mystring = 'https://www.example.com'
        page_products_urls2 = [mystring + s for s in page_products_urls]
        page_urls.extend(page_products_urls2)
        time.sleep(2)
        for y in soup.find_all("a", class_=("lot-title")):
            year_make_model_type.append(y.text)
        time.sleep(2)
        for p in soup.find_all("span", class_=("lot-price")):
            price.append(re.sub("[\$\,]", "", p.text))
        time.sleep(2)
        x += 1
    if x == 2:
        break
    else:
        break
len(page_urls) #42
len(set(page_urls)) #42
len(price) #42
len(set(price)) #36
len(year_make_model_type) #42
len(set(year_make_model_type)) #13
# If you need to go back to the first page
#driver.find_element_by_xpath('//a[@href="/search/page/1/"]').click()
# Create df
scraped_data = pd.DataFrame({'url': page_urls, 'year_make_model_type': year_make_model_type, 'price':price})
scraped_data['price'] = scraped_data['price'].replace('', np.NaN)
scraped_data['price'] = scraped_data['price'].astype(float)
scraped_data.shape
scraped_data.head()
#driver.quit()
This last bit of code scrapes the highlights and flag_group per vehicle from its product page.
# Create additional features per product url (have to click on product to be able to scrape these features)
def getAndParseURL(url):
    result = requests.get(url)
    soup = BeautifulSoup(result.text, 'html.parser')
    return(soup)

highlights = []
flag_group = []

# Add features per vehicle
for url in page_urls:
    # Vehicle highlights
    highlights1 = []
    soup = getAndParseURL(url)
    if not soup.find("ul", class_="lot-highlights hidden-print"):
        highlights1.append(np.NaN)
    else:
        hl = soup.find("ul", class_="lot-highlights hidden-print").text.strip()
        hl = hl.replace("\n", ", ").strip()
        highlights1.append(hl)
    highlights.extend(highlights1)

    # Vehicle flag_group
    attraction = []
    soup = getAndParseURL(url)
    flag = soup.find(class_=["flag flag-main", "flag flag-star", "flag flag-feature"])
    if flag:
        attraction.append(flag.contents[0])
    else:
        attraction.append(np.NaN)
    flag_group.extend(attraction)
# Assign new features to existing df
scraped_data = scraped_data.assign(**{'highlights': highlights, 'flag_group': flag_group})#, 'reserve': reserve})
scraped_data.shape
scraped_data.head()
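One thing I already suspect: the loop above downloads and parses every product page twice (once for the highlights, once for the flag). A single-pass version reusing one requests.Session might already help; a rough, untested sketch using the same selectors as above:

# Untested sketch: one request and one parse per product page, reusing a session
session = requests.Session()
highlights = []
flag_group = []
for url in page_urls:
    soup = BeautifulSoup(session.get(url).text, 'html.parser')
    hl = soup.find("ul", class_="lot-highlights hidden-print")
    highlights.append(hl.text.strip().replace("\n", ", ") if hl else np.NaN)
    flag = soup.find(class_=["flag flag-main", "flag flag-star", "flag flag-feature"])
    flag_group.append(flag.contents[0] if flag else np.NaN)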
Let me know (or show me) where you think the code above can be improved. Thanks for taking the time!
You really, really don't need all this very long code. You don't even need Selenium, and there's no need to keep repeating the same scraping block for every page.
The code below should achieve your goal easily!
Note: I've only scraped the first 3 pages; you can increase the loop range for your desired target.
import requests
from bs4 import BeautifulSoup
from prettytable import PrettyTable

data = {
    "searchScope": "past",
    "searchText": "PLYMOUTH",
    "searchMake": "Plymouth",
    "searchModel": "Cuda",
    "searchYearStart": "1970",
    "searchYearEnd": "1971",
    "submit": ""
}

headers = {
    "Referer": "https://www.mecum.com",
}

login = {"email": "arjenvgeffen@hotmail.com"}

def main(url):
    with requests.Session() as req:
        r = req.post(
            "https://www.mecum.com/includes/login-action.cfm", data=login)
        p = PrettyTable()
        p.field_names = ["Name", "Url", "Price"]
        for item in range(1, 4):
            r = req.post(url.format(item), data=data, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            target = soup.select("div.lot")
            for tar in target:
                price = tar.span.text if tar.span.text else "N/A"
                hint = tar.select_one("a.lot-title")
                p.add_row(
                    [hint.text, f"{url[:21]}{hint['href']}", price])
        print(p)

main("https://www.mecum.com/search/page/{}/")
Output:
+----------------------------------------------------------+----------------------------------------------------------------------------------------------+----------+
| Name | Url | Price |
+----------------------------------------------------------+----------------------------------------------------------------------------------------------+----------+
| 1936 Plymouth Coupe | https://www.mecum.com/lots/HA0420-412309/1936-plymouth-coupe/ | N/A |
| 1937 Plymouth Deluxe Pickup | https://www.mecum.com/lots/HA0420-412385/1937-plymouth-deluxe-pickup/ | N/A |
| 1951 Plymouth Convertible | https://www.mecum.com/lots/HA0420-412744/1951-plymouth-convertible/ | N/A |
| 1968 Plymouth Road Runner | https://www.mecum.com/lots/HA0420-412874/1968-plymouth-road-runner/ | N/A |
| 1970 Plymouth Cuda | https://www.mecum.com/lots/HA0420-413047/1970-plymouth-cuda/ | N/A |
| 1971 Plymouth Cuda Convertible | https://www.mecum.com/lots/HA0420-413138/1971-plymouth-cuda-convertible/ | N/A |
| 1968 Plymouth Road Runner | https://www.mecum.com/lots/HA0420-427812/1968-plymouth-road-runner/ | N/A |
| 1969 Plymouth Road Runner | https://www.mecum.com/lots/AZ0320-404226/1969-plymouth-road-runner/ | $19,250 |
| 1973 Plymouth Duster Police Car | https://www.mecum.com/lots/AZ0320-404232/1973-plymouth-duster-police-car/ | $18,700 |
| 1963 Plymouth Valiant Signet 200 Convertible | https://www.mecum.com/lots/AZ0320-404250/1963-plymouth-valiant-signet-200-convertible/ | $3,850 |
| 1946 Plymouth Taxi | https://www.mecum.com/lots/AZ0320-404267/1946-plymouth-taxi/ | $3,300 |
| 1969 Plymouth GTX | https://www.mecum.com/lots/AZ0320-404449/1969-plymouth-gtx/ | $25,000 |
| 1999 Plymouth Prowler | https://www.mecum.com/lots/AZ0320-404457/1999-plymouth-prowler/ | $20,000 |
| 1967 Plymouth Barracuda Formula S Fastback | https://www.mecum.com/lots/AZ0320-404478/1967-plymouth-barracuda-formula-s-fastback/ | $33,000 |
| 1970 Plymouth Cuda Convertible | https://www.mecum.com/lots/AZ0320-404626/1970-plymouth-cuda-convertible/ | $51,700 |
| 1967 Plymouth GTX | https://www.mecum.com/lots/AZ0320-404634/1967-plymouth-gtx/ | $31,350 |
| 1970 Plymouth Cuda Resto Mod | https://www.mecum.com/lots/AZ0320-404636/1970-plymouth-cuda-resto-mod/ | $50,000 |
| 1969 Plymouth Road Runner | https://www.mecum.com/lots/AZ0320-404656/1969-plymouth-road-runner/ | $34,100 |
| 1970 Plymouth Cuda | https://www.mecum.com/lots/AZ0320-404858/1970-plymouth-cuda/ | $70,000 |
| 1970 Plymouth Superbird | https://www.mecum.com/lots/AZ0320-404866/1970-plymouth-superbird/ | $143,000 |
| 1967 Plymouth Satellite Convertible | https://www.mecum.com/lots/AZ0320-404883/1967-plymouth-satellite-convertible/ | $30,800 |
| 1970 Plymouth AAR Cuda | https://www.mecum.com/lots/AZ0320-404897/1970-plymouth-aar-cuda/ | $71,500 |
| 1967 Plymouth Barracuda Resto Mod | https://www.mecum.com/lots/AZ0320-404918/1967-plymouth-barracuda-resto-mod/ | $60,500 |
| 1969 Plymouth GTX Convertible | https://www.mecum.com/lots/AZ0320-404950/1969-plymouth-gtx-convertible/ | $42,000 |
| 1959 Plymouth Sport Fury | https://www.mecum.com/lots/AZ0320-404972/1959-plymouth-sport-fury/ | $30,000 |
| 1965 Plymouth Barracuda | https://www.mecum.com/lots/AZ0320-405120/1965-plymouth-barracuda/ | $22,000 |
| 1970 Plymouth Hemi Cuda | https://www.mecum.com/lots/AZ0320-405220/1970-plymouth-hemi-cuda/ | $150,700 |
| 1970 Plymouth Superbird | https://www.mecum.com/lots/AZ0320-405229/1970-plymouth-superbird/ | $115,000 |
| 1970 Plymouth Cuda | https://www.mecum.com/lots/AZ0320-405236/1970-plymouth-cuda/ | $52,500 |
| 1970 Plymouth Hemi Cuda | https://www.mecum.com/lots/AZ0320-405266/1970-plymouth-hemi-cuda/ | $130,000 |
| 1968 Plymouth Hemi Road Runner | https://www.mecum.com/lots/AZ0320-405267/1968-plymouth-hemi-road-runner/ | $70,000 |
| 1969 Plymouth Hemi Road Runner | https://www.mecum.com/lots/AZ0320-405286/1969-plymouth-hemi-road-runner/ | $62,000 |
| 1969 Plymouth Road Runner | https://www.mecum.com/lots/AZ0320-405304/1969-plymouth-road-runner/ | $120,000 |
| 1959 Plymouth Sport Fury Convertible | https://www.mecum.com/lots/AZ0320-405321/1959-plymouth-sport-fury-convertible/ | $70,000 |
| 1973 Plymouth Cuda Resto Mod | https://www.mecum.com/lots/AZ0320-405340/1973-plymouth-cuda-resto-mod/ | $75,000 |
| 1969 Plymouth Sport Satellite Convertible | https://www.mecum.com/lots/AZ0320-405384/1969-plymouth-sport-satellite-convertible/ | $37,400 |
| 1970 Plymouth AAR Cuda | https://www.mecum.com/lots/AZ0320-405385/1970-plymouth-aar-cuda/ | $55,000 |
| 1969 Plymouth Road Runner | https://www.mecum.com/lots/AZ0320-423532/1969-plymouth-road-runner/ | $60,500 |
| 1970 Plymouth Hemi Cuda | https://www.mecum.com/lots/AZ0320-423534/1970-plymouth-hemi-cuda/ | $93,500 |
| 1968 Plymouth Hemi Road Runner | https://www.mecum.com/lots/AZ0320-423535/1968-plymouth-hemi-road-runner/ | $66,000 |
| 1970 Plymouth Cuda | https://www.mecum.com/lots/AZ0320-423545/1970-plymouth-cuda/ | $60,000 |
| 1940s-50s Desoto Plymouth Double-Sided Porcelain 45x42 | https://www.mecum.com/lots/AZ0320-424465/1940s-50s-desoto-plymouth-double-sided-porcelain/ | $2,950 |
| 1940s-50s Dodge Plymouth Double-Sided Porcelain 42-in | https://www.mecum.com/lots/AZ0320-424468/1940s-50s-dodge-plymouth-double-sided-porcelain/ | $5,900 |
| 1940s-50s Chrysler Plymouth Double-Sided Porcelain 42-in | https://www.mecum.com/lots/AZ0320-424471/1940s-50s-chrysler-plymouth-double-sided-porcelain/ | $3,776 |
| 1969 Plymouth Road Runner | https://www.mecum.com/lots/AZ0320-424624/1969-plymouth-road-runner/ | $59,400 |
| 1965 Plymouth Sport Fury Convertible | https://www.mecum.com/lots/AZ0320-424629/1965-plymouth-sport-fury-convertible/ | $13,750 |
| 1970 Plymouth Road Runner Convertible | https://www.mecum.com/lots/AZ0320-428253/1970-plymouth-road-runner-convertible/ | $45,000 |
| 1970 Plymouth Barracuda Convertible | https://www.mecum.com/lots/AZ0320-428658/1970-plymouth-barracuda-convertible/ | $42,900 |
| 1966 Plymouth Barracuda | https://www.mecum.com/lots/FL0120-394693/1966-plymouth-barracuda/ | $9,625 |
| 1965 Plymouth Barracuda | https://www.mecum.com/lots/FL0120-394746/1965-plymouth-barracuda/ | $7,700 |
| 1969 Plymouth Satellite | https://www.mecum.com/lots/FL0120-394747/1969-plymouth-satellite/ | $3,850 |
| 1954 Plymouth Savoy | https://www.mecum.com/lots/FL0120-394753/1954-plymouth-savoy/ | $7,150 |
| 1952 Plymouth Police Car | https://www.mecum.com/lots/FL0120-394828/1952-plymouth-police-car/ | N/A |
| 1970 Plymouth Duster | https://www.mecum.com/lots/FL0120-394921/1970-plymouth-duster/ | $26,400 |
| 1965 Plymouth Barracuda | https://www.mecum.com/lots/FL0120-394956/1965-plymouth-barracuda/ | $8,800 |
| 1950 Plymouth Special Deluxe | https://www.mecum.com/lots/FL0120-394983/1950-plymouth-special-deluxe/ | $8,250 |
| 1973 Plymouth Road Runner | https://www.mecum.com/lots/FL0120-395009/1973-plymouth-road-runner/ | $21,000 |
| 1970 Plymouth Road Runner | https://www.mecum.com/lots/FL0120-395013/1970-plymouth-road-runner/ | $51,700 |
| 1969 Plymouth Barracuda | https://www.mecum.com/lots/FL0120-395106/1969-plymouth-barracuda/ | $17,600 |
| 1966 Plymouth Satellite Convertible | https://www.mecum.com/lots/FL0120-395145/1966-plymouth-satellite-convertible/ | $26,400 |
| 1970 Plymouth Road Runner | https://www.mecum.com/lots/FL0120-395341/1970-plymouth-road-runner/ | $47,300 |
| 1970 Plymouth Cuda | https://www.mecum.com/lots/FL0120-395362/1970-plymouth-cuda/ | $61,000 |
| 1999 Plymouth Prowler Convertible | https://www.mecum.com/lots/FL0120-395647/1999-plymouth-prowler-convertible/ | $30,800 |
+----------------------------------------------------------+----------------------------------------------------------------------------------------------+----------+
I've edited Ahmed's code to get my desired output (a pandas DataFrame):
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

data = {
    "searchScope": "past",
    "searchMake": "Plymouth",
    "searchModel": "Cuda",
    "searchYearStart": "1970",
    "searchYearEnd": "1971",
    "submit": ""
}

headers = {
    "Referer": "https://www.example.com",
}

login = {"email": "example@hotmail.com"}

price = []
urls = []
title = []
results = []

def main(url):
    with requests.Session() as req:
        r = req.post(
            "https://www.example.com/includes/login-action.cfm", data=login)
        for item in range(1, 30):
            r = req.post(url.format(item), data=data, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            target = soup.select("div.lot")
            for tar in target:
                urls.append(tar.a.get('href'))
                title.append(tar.select_one("a.lot-title").text)
                price.append(tar.span.text if tar.span.text else np.NaN)
                # join the class list of each matching image container div
                containers = tar.select("div[class*=lot-image-container]")
                for container in containers:
                    results.append(' '.join(container['class']))

main("https://www.example.com/search/page/{}/")

scraped_data = pd.DataFrame({'url': urls, 'year_make_model_type': title, 'price': price, 'results': results})
scraped_data.shape
scraped_data["results"] = scraped_data["results"].str.replace("lot-image-container", "")
scraped_data["results"] = scraped_data["results"].replace('', np.NaN)
scraped_data.head()
Now I want to extract features from the list of product pages, which are in the column 'url'. Below is a working example, but it's way too slow. I've tried to speed it up with multiprocessing but haven't figured it out yet; I want to extract about 10 more features for 500+ pages, so it has to be faster than this (a sketch of the threaded direction I'm aiming for is below the output).
low_url = ['https://www.mecum.com/lots/KC1210-101030/1970-plymouth-cuda/',
'https://www.mecum.com/lots/SC0510-91294/1970-plymouth-hemi-cuda/',
'https://www.mecum.com/lots/KC1210-100686/1970-plymouth-barracuda-convertible/',
'https://www.mecum.com/lots/KA0316-235834/1970-plymouth-barracuda-convertible/',
'https://www.mecum.com/lots/FL0110-88180/1970-plymouth-barracuda/']
reserve = []
with requests.Session() as req:
    for url in low_url:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        attraction2 = []
        if not soup.find(class_=["flag flag-no-reserve"]):
            attraction2.append(np.NaN)
        else:
            r = soup.find(class_=["flag flag-no-reserve"])
            attraction2.append(r.contents[0])
        reserve.extend(attraction2)

len(reserve)
len(set(reserve))
reserve
Out: ['No Reserve', nan, nan, 'No Reserve', nan]
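The direction I'm thinking of is roughly the thread-pool sketch below, using the standard-library concurrent.futures (untested against the site's rate limits, so the worker and max_workers value are just assumptions; it uses the same "flag flag-no-reserve" lookup as above):

from concurrent.futures import ThreadPoolExecutor

import numpy as np
import requests
from bs4 import BeautifulSoup

# Sharing one Session across threads is usually fine for simple GETs;
# switch to one session per thread if anything looks off.
session = requests.Session()

def get_reserve(url):
    # Fetch one product page and return its 'No Reserve' flag text (or NaN).
    soup = BeautifulSoup(session.get(url).content, 'html.parser')
    flag = soup.find(class_=["flag flag-no-reserve"])
    return flag.contents[0] if flag else np.NaN

# executor.map preserves input order, so 'reserve' still lines up with low_url.
with ThreadPoolExecutor(max_workers=8) as executor:
    reserve = list(executor.map(get_reserve, low_url))

If that works, the same pattern should extend to the other per-page features: have the worker return all the fields for one page and build the DataFrame from the list of results.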