having problems downloading a csv file - python

I'm trying to write a function that downloads a CSV file from its download link and then prints it line by line, but I'm running into problems when I try to save it:
from urllib import request

def download_data(csv_url):
    response = request.urlopen(csv_url)
    csv = response.read()
    csv_str = str(csv)
    lines = csv_str.split("\\n")
    dest_url = r'data.csv'
    fx = open(dest_url, 'r')
    for line in lines:
        fx.write(line + '/n')
    fx.close()
When I give it the CSV link, it tells me it can't find the file/directory "data.csv", even though it should have been downloaded.
Running macOS.

You're opening the file for reading, and in 'r' mode Python raises an error if the file doesn't already exist. Change the 'r' in fx = open(dest_url, 'r') to 'w':
fx = open(dest_url, 'w')
As a side note, you really should be using a with statement. with makes the file object close automatically once execution leaves the with block's scope, so you don't have to worry about closing it yourself.
def download_data(csv_url):
    response = request.urlopen(csv_url)
    with open('data.csv', 'w') as f:
        f.write(str(response.read()))
Though really there isn't any need to save the file at all if you're just going to read it and display the contents on the screen. Just have download_data return csv_str.
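A minimal sketch of that version (it assumes the response body is UTF-8 text; decoding the bytes avoids the b'...' representation that str() would otherwise produce):

from urllib import request

def download_data(csv_url):
    response = request.urlopen(csv_url)
    # Decode the raw bytes instead of wrapping them in str(),
    # so the text splits on real newlines rather than literal \n escapes.
    return response.read().decode('utf-8')

for line in download_data('https://people.sc.fsu.edu/~jburkardt/data/csv/addresses.csv').splitlines():
    print(line)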
Finally, take a look at the built-in csv module. It makes life easy.
import csv
from io import StringIO

import requests

def download_data(csv_url):
    return csv.reader(
        StringIO(requests.get(csv_url).text),
        delimiter=','
    )

for row in download_data('https://people.sc.fsu.edu/~jburkardt/data/csv/addresses.csv'):
    print("| {} |".format(' | '.join(row)))
# Prints:
#
# | John | Doe | 120 jefferson st. | Riverside | NJ | 08075 |
# | Jack | McGinnis | 220 hobo Av. | Phila | PA | 09119 |
# | John "Da Man" | Repici | 120 Jefferson St. | Riverside | NJ | 08075 |
# | Stephen | Tyler | 7452 Terrace "At the Plaza" road | SomeTown | SD | 91234 |
# | | Blankman | | SomeTown | SD | 00298 |
# | Joan "the bone", Anne | Jet | 9th, at Terrace plc | Desert City | CO | 00123 |

Related

Unable to fetch the entire data from kafka topic to cassandra using python

I want to move data from MySQL to Cassandra in real time using Apache Kafka.
Here is my producer code in Python:
import json
from kafka import KafkaProducer
import pymysql.cursors

producer = KafkaProducer(bootstrap_servers=['localhost:9092'])
connection = pymysql.connect(host='127.0.0.1',
                             user='root',
                             port=3306,
                             password='Mysql#123',
                             db='bank_transaction',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
cursor = connection.cursor()
sql = "Select * from transactions"
cursor.execute(sql)
rows = cursor.fetchall()
data = ""
for row in rows:
    producer.send('demo', json.dumps(row).encode("utf-8"))
cursor.close()
connection.close()

# configure multiple retries
producer = KafkaProducer(retries=5)
This is the sample output I'm getting after running a simple Kafka consumer:
{u'status': u'INITIATED', u'stped': u'STP', u'remark': u'adkaDKA',
u'cr_cust_id': 4321, u'txn_Id': u'FT123456', u'currency': u'USD',
u'dr_cust_type': u'Retail', u'dr_cust_id': 1234, u'bank_user': u'FO
MAKER', u'txn_start_date_time': u'3/1/2022 8:00', u'txn_code':
u'FT001', u'dept': u'FRONT OFFICE', u'txn_end_date_time': u'3/1/2022
8:30', u'source': u'Mobile', u'amount': 1000, u'dr_cust_acct':
1234567890, u'txn_Type': u'Fund Transfer', u'dr_cust_name': u'Vimal',
u'cr_cust_name': u'Vivek', u'cr_cust_type': u'Retail',
u'cr_cust_acct': 987654321} {u'status': u'INITIATED', u'stped':
u'STP', u'remark': u'adkaDKA', u'cr_cust_id': 4321, u'txn_Id':
u'FT123456', u'currency': u'USD', u'dr_cust_type': u'Retail',
u'dr_cust_id': 1234, u'bank_user': u'FO CHECKER',
u'txn_start_date_time': u'3/1/2022 8:00', u'txn_code': u'FT001',
u'dept': u'FRONT OFFICE', u'txn_end_date_time': u'3/1/2022 8:30',
u'source': u'Mobile', u'amount': 1000, u'dr_cust_acct': 1234567890,
u'txn_Type': u'Fund Transfer', u'dr_cust_name': u'Vimal',
u'cr_cust_name': u'Vivek', u'cr_cust_type': u'Retail',
u'cr_cust_acct': 987654321}
Here is the consumer code that stores the data from the Kafka topic into Cassandra:
from encodings import utf_8
from kafka import KafkaConsumer
import json
from cassandra.cluster import Cluster
from cassandra.policies import DCAwareRoundRobinPolicy

def interpret_constant(c):
    try:
        if str(int(c)) == c: return int(c)
    except ValueError:
        pass
    try:
        if str(float(c)) == c: return float(c)
    except ValueError:
        return c

cluster = Cluster()
session = cluster.connect('test')
print("After connecting to kafka")

consumer = KafkaConsumer('demo',
                         group_id='my-group',
                         bootstrap_servers=['localhost:9092'])

def insert(message):
    msg = message.value.decode('utf-8')
    #print(msg)
    msg = json.loads(msg)
    print(msg)
    keys = ",".join(msg.keys())
    values = ','.join(str(v) for v in msg.values())
    user_insert_stmt = session.prepare("insert into response ({0}) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)".format(keys))
    #msg = json.loads(message.message.value.decode('utf-8'))
    #msg = message.message.value.split("|")
    #print(msg)
    new_msg = [interpret_constant(x) for x in msg.values()]
    #print(new_msg)
    return session.execute(user_insert_stmt, new_msg)

for message in consumer:
    insert(message)
After running the above code, I'm only getting 9 rows of data in Cassandra (it should be 49), and the int data type values are not showing up in Cassandra.
select * from response;
txn_id | amount | bank_user | cr_cust_acct | cr_cust_id | cr_cust_name | cr_cust_type | currency | dept | dr_cust_acct | dr_cust_id | dr_cust_name | dr_cust_type | remark | source | status | stped | txn_code | txn_end_date_time | txn_start_date_time | txn_type
-----------+--------+------------+--------------+------------+--------------+--------------+----------+--------------+--------------+------------+--------------+--------------+---------+---------------+-----------+-------+----------+-------------------+---------------------+-----------------
AC123456 | null | BO CHECKER | null | null | Vivek | Retail | USD | BACK OFFICE | null | null | Vimal | HNI-VIP | adkaDKA | Mobile | COMPLETED | STP | | 3/1/2022 8:30 | 3/1/2022 8:00 | ACCT TO ACCT
TT123456 | null | BO CHECKER | null | null | Vivek | Retail | USD | BACK OFFICE | null | null | Vimal | SME | adkaDKA | IB | COMPLETED | STP | TT0001 | 3/1/2022 8:30 | 3/1/2022 8:00 | CHEQUE DEPOSIT
FT123456 | null | SYS | null | null | Vivek | Retail | USD | SYSTEM | null | null | Vimal | Retail | adkaDKA | BRANCH MANUAL | COMPLETED | NSTP | | 3/1/2022 8:30 | 3/1/2022 8:00 | Fund Transfer
FT1234567 | null | FO MAKER | null | null | Kunal | Retail | USD | FRONT OFFICE | null | null | Manan | Retail | adkaDKA | Mobile | INITIATED | STP | TT0001 | 3/1/2022 8:30 | 3/1/2022 8:00 | Fund Transfer
FTM12345 | null | BO CHECKER | null | null | Vivek | CORP | USD | BACK OFFICE | null | null | Vimal | CORP | adkaDKA | Mobile | COMPLETED | STP | | 3/1/2022 8:30 | 3/1/2022 8:00 | DIRECT DEBIT
FD123456 | null | BO CHECKER | null | null | Vivek | Retail | USD | BACK OFFICE | null | null | Vimal | CORP | adkaDKA | BRANCH MANUAL | COMPLETED | STP | FD0001 | 3/1/2022 8:30 | 3/1/2022 8:00 | FIXED DEPOSIT
MC123456 | null | BO CHECKER | null | null | Vivek | HNI-VIP | USD | BACK OFFICE | null | null | Vimal | HNI-VIP | adkaDKA | IB | COMPLETED | STP | | 3/1/2022 8:30 | 3/1/2022 8:00 | MANAGERS CHEQUE
RR123456 | null | BO CHECKER | null | null | Vivek | Retail | USD | BACK OFFICE | null | null | Vimal | Retail | adkaDKA | IB | COMPLETED | STP | | 3/1/2022 8:30 | 3/1/2022 8:00 | CHARGE REVERSAL
CHG123456 | null | BO CHECKER | null | null | Vivek | SME | USD | BACK OFFICE | null | null | Vimal | SME | adkaDKA | BRANCH MANUAL | COMPLETED | STP | | 3/1/2022 8:30 | 3/1/2022 8:00 | CHARGE POSTING
Cassandra table description
CREATE TABLE test.response (
txn_id text PRIMARY KEY,
amount int,
bank_user text,
cr_cust_acct int,
cr_cust_id int,
cr_cust_name text,
cr_cust_type text,
currency text,
dept text,
dr_cust_acct int,
dr_cust_id int,
dr_cust_name text,
dr_cust_type text,
remark text,
source text,
status text,
stped text,
txn_code text,
txn_end_date_time text,
txn_start_date_time text,
txn_type text
) WITH additional_write_policy = '99p'
AND bloom_filter_fp_chance = 0.01
AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
AND cdc = false
AND comment = ''
AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}
AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
AND crc_check_chance = 1.0
AND default_time_to_live = 0
AND extensions = {}
AND gc_grace_seconds = 864000
AND max_index_interval = 2048
AND memtable_flush_period_in_ms = 0
AND min_index_interval = 128
AND read_repair = 'BLOCKING'
AND speculative_retry = '99p';
Python version 2.7, Kafka version 2.7.0, Cassandra version 4.0.1.
What changes do I need to make to get the entire data from the Kafka topic into Cassandra?

SAS Programming: How to replace missing values in multiple columns using one column?

Background
I have a large dataset in SAS that has 17 variables of which four are numeric and 13 character/string. The original dataset that I am using can be found here: https://www.kaggle.com/austinreese/craigslist-carstrucks-data.
cylinders
condition
drive
paint_color
type
manufacturer
title_status
model
fuel
transmission
description
region
state
price (num)
posting_date (num)
odometer (num)
year (num)
After applying specific filters to the numeric columns, there are no missing values for any numeric variable. However, there are thousands to hundreds of thousands of missing values in the remaining 13 char/string variables.
Request
Similar to the Towards Data Science blog post here (https://towardsdatascience.com/end-to-end-data-science-project-predicting-used-car-prices-using-regression-1b12386c69c8), specifically the Feature Engineering section, how can I write equivalent SAS code that uses regex on the description column to fill missing values in the other string/char columns (cylinders, condition, drive, paint_color, and so on) with categorical values?
Here is the Python code from the blog post.
import re
manufacturer = '(gmc | hyundai | toyota | mitsubishi | ford | chevrolet | ram | buick | jeep | dodge | subaru | nissan | audi | rover | lexus \
| honda | chrysler | mini | pontiac | mercedes-benz | cadillac | bmw | kia | volvo | volkswagen | jaguar | acura | saturn | mazda | \
mercury | lincoln | infiniti | ferrari | fiat | tesla | land rover | harley-davidson | datsun | alfa-romeo | morgan | aston-martin | porche \
| hennessey)'
condition = '(excellent | good | fair | like new | salvage | new)'
fuel = '(gas | hybrid | diesel |electric)'
title_status = '(clean | lien | rebuilt | salvage | missing | parts only)'
transmission = '(automatic | manual)'
drive = '(4x4 | awd | fwd | rwd | 4wd)'
size = '(mid-size | full-size | compact | sub-compact)'
type_ = '(sedan | truck | SUV | mini-van | wagon | hatchback | coupe | pickup | convertible | van | bus | offroad)'
paint_color = '(red | grey | blue | white | custom | silver | brown | black | purple | green | orange | yellow)'
cylinders = '(\s[1-9] cylinders? |\s1[0-6]? cylinders?)'
keys = ['manufacturer', 'condition', 'fuel', 'title_status', 'transmission', 'drive','size', 'type', 'paint_color' , 'cylinders']
columns = [ manufacturer, condition, fuel, title_status, transmission ,drive, size, type_, paint_color, cylinders]
for i, column in zip(keys, columns):
    database[i] = database[i].fillna(
        database['description'].str.extract(column, flags=re.IGNORECASE, expand=False)).str.lower()
database.drop('description', axis=1, inplace=True)
What would be the equivalent SAS code for the Python code shown above?
It's basically just doing a word search of sorts.
A simplified example in SAS:
data want;
    set have;
    array _fuel(*) $ _temporary_ ("gas", "hybrid", "diesel", "electric");
    do i=1 to dim(_fuel);
        if find(description, _fuel(i), 'it') > 0 then fuel = _fuel(i);
        *does not deal with multiple finds so the last one found will be kept;
    end;
run;
You can expand this by creating an array for each variable and then looping through your lists. I think you could replace the loop with a regex function in SAS as well, but regex requires too much thinking, so someone else will have to provide that answer.

Scrapy handle missing path

I am building a forum-scraper for a university project. The page of the forum that I am using is the following: https://www.eurobricks.com/forum/index.php?/forums/topic/163541-lego-ninjago-2019/&tab=comments#comment-2997338.
I am able to extract all the information that I need except for the location. This information is stored inside the following path.
<li class="ipsType_light">
    <span class="fc">Country_name</span>
</li>
The problem is that sometimes this information and path do not exist, and my current solution cannot handle that.
Here is the code I wrote to get the location information:
location_path = "//span[@class='fc']/text()"

def parse_thread(self, response):
    comments = response.xpath("//*[@class='cPost_contentWrap ipsPad']")
    username = response.xpath(self.user_path).extract()
    x = len(username)
    if x > 0:
        score = response.xpath(self.score_path).extract()
        content = ["".join(comment.xpath(".//*[@data-role='commentContent']/p/text()").extract()) for comment in comments]
        date = response.xpath(self.date_path).extract()
        location = response.xpath(self.location_path).extract()
        for i in range(x):
            yield {
                "title": title,
                "category": category,
                "user": username[i],
                "score": score[i],
                "content": content[i],
                "date": date[i],
                "location": location[i]
            }
One possible solution I have tried was to check the length of the location list, but it is not working.
Right now the code results in the following (sample data)
Title | category | test1 | 502 | 22 june 2020 | correct country
Title | category | test2 | 470 | 22 june 2020 | wrong country (it takes the next user country)
Title | category | test3 | 502 | 28 june 2020 | correct country
And what I would like to achieve is:
Title | category | test1 | 502 | 22 june 2020 | correct country
Title | category | test2 | 470 | 22 june 2020 | Not available
Title | category | test3 | 502 | 28 june 2020 | correct country
The solution to my problem is that, instead of selecting each piece of information one by one, I first have to select the entire block that contains all the information for a comment, and only then pick out the single pieces of information that I need.
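A minimal sketch of that per-block approach (the content-wrapper, comment-content and location selectors come from the code above; the user, score and date XPaths are placeholders, and it assumes the location span can be reached from inside each comment block):

def parse_thread(self, response):
    # Iterate over one comment block at a time so every field,
    # including the optional location, stays tied to the same comment.
    for comment in response.xpath("//*[@class='cPost_contentWrap ipsPad']"):
        location = comment.xpath(".//span[@class='fc']/text()").get()
        yield {
            "user": comment.xpath(".//placeholder_user/text()").get(),    # placeholder XPath
            "score": comment.xpath(".//placeholder_score/text()").get(),  # placeholder XPath
            "content": "".join(comment.xpath(".//*[@data-role='commentContent']/p/text()").getall()),
            "date": comment.xpath(".//placeholder_date/text()").get(),    # placeholder XPath
            "location": location if location else "Not available",
        }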

Python program that reorganizes Excel formatting?

I am working on a Python program that aims to take Excel data that is vertical and make it horizontal.
For example, the data is shaped something like this:
County | State | Number | Date
Oakland | MI | 19 | 1/12/10
Oakland | MI | 32 | 1/19/10
Wayne | MI | 9 | 1/12/10
Wayne | MI | 6 | 1/19/10
But I want it like this (purposefully excluding the state):
County | 1/12/10 | 1/19/10
Oakland | 19 | 32
Wayne | 9 | 6
(And for the actual data, it’s quite long).
My logic so far:
Read in the Excel File
Loop through the counties
If county name is the same, place # in Row 1?
Make a new Excel File?
Any ideas of how to write this out? I think I am a little stuck on the syntax here.
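A minimal sketch of that reshape using pandas; the file names are hypothetical, and it assumes there is one Number per County/Date pair (pivot_table turns each unique Date into its own column):

import pandas as pd

# Read the vertical data: County | State | Number | Date
df = pd.read_excel("input.xlsx")  # hypothetical input file

# Pivot so each unique Date becomes a column with Number as the cell value;
# State is left out on purpose.
wide = df.pivot_table(index="County", columns="Date", values="Number", aggfunc="first")

# Write the horizontal result to a new Excel file.
wide.to_excel("output.xlsx")  # hypothetical output file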

Looking for alternative to Selenium for scraping multiple pages

I get the desired results, but I think some of the code could be improved. It's currently quite slow and error-prone when scraping multiple pages in a row. The code below scrapes 5 features for 42 vehicles (21 per page). I'm scraping a total of 18 features (the other 13 features are not shown here) for these two pages, but it takes too long considering I wish to scrape a total of 29 pages.
In order to see the vehicle price you need to log in, which is why I'm using Selenium, as shown in the code below.
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
from selenium import webdriver
import time
from IPython.core.interactiveshell import InteractiveShell  # optional

# Change cell settings (optional)
InteractiveShell.ast_node_interactivity = "all"
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 100
pd.options.display.max_columns = None

driver = webdriver.Chrome()
#driver.maximize_window()  # optional

# Log in and search
urls = ["https://www.example.com/"]
for url in urls:
    driver.get(url)
    time.sleep(1)
    driver.find_elements_by_class_name("dropdown-toggle")[0].click()
    time.sleep(1)
    driver.find_elements_by_name('email')[0].send_keys("arjenvgeffen#hotmail.com")
    time.sleep(1)
    driver.find_elements_by_name("submit")[0].click()
    time.sleep(2)
    link = driver.find_element_by_link_text('SEARCH')
    time.sleep(1)
    link.click()
    time.sleep(2)
    driver.find_elements_by_name("searchScope")[0].send_keys('ALL PAST')
    time.sleep(1)
    driver.find_elements_by_name("searchMake")[0].send_keys('PLYMOUTH')
    time.sleep(1)
    driver.find_elements_by_name('searchModel')[0].send_keys('Cuda')
    time.sleep(1)
    driver.find_elements_by_name('searchYearStart')[0].send_keys("1970")
    time.sleep(1)
    driver.find_elements_by_name('searchYearEnd')[0].send_keys("1971")
    time.sleep(1)
    driver.find_element_by_xpath("//button[. = 'Search']").click()
    time.sleep(1)
The code below scrapes the vehicle title (year_make_model_type), the price (which you can only see after logging in above with the email) and the page URLs. The page_urls will be used in the next step to scrape information per product page. This takes too long when scraping 29 pages and it tends to skip/get stuck. Any improvement here is much appreciated!
# Scrape two pages (these two variables can be scraped without being on the vehicle page)
i = 0
x = 1
year_make_model_type = []
price = []

while True:
    for i in range(0, 1):
        time.sleep(2)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        time.sleep(2)
        urls = [x.get('href') for x in soup.findAll("a", class_="lot-title")]
        time.sleep(2)
        mystring = 'https://www.example.com'
        page_urls = [mystring + s for s in urls]
        time.sleep(2)
        for y in soup.find_all("a", class_="lot-title"):
            year_make_model_type.append(y.text)
        time.sleep(2)
        for p in soup.find_all("span", class_="lot-price"):
            price.append(re.sub("[\$\,]", "", p.text))
        time.sleep(2)
        i += 1
    for x in range(2, 3):
        time.sleep(5)
        driver.find_element_by_xpath('//a[@href="/search/page/%d/"]' % (x,)).click()
        time.sleep(5)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        time.sleep(2)
        page_products_urls = [x.get('href') for x in soup.findAll("a", class_="lot-title")]
        time.sleep(2)
        mystring = 'https://www.example.com'
        page_products_urls2 = [mystring + s for s in page_products_urls]
        page_urls.extend(page_products_urls2)
        time.sleep(2)
        for y in soup.find_all("a", class_="lot-title"):
            year_make_model_type.append(y.text)
        time.sleep(2)
        for p in soup.find_all("span", class_="lot-price"):
            price.append(re.sub("[\$\,]", "", p.text))
        time.sleep(2)
        x += 1
    if x == 2:
        break
    else:
        break

len(page_urls)  # 42
len(set(page_urls))  # 42
len(price)  # 42
len(set(price))  # 36
len(year_make_model_type)  # 42
len(set(year_make_model_type))  # 13

# If you need to go back to the first page
#driver.find_element_by_xpath('//a[@href="/search/page/1/"]').click()

# Create df
scraped_data = pd.DataFrame({'url': page_urls, 'year_make_model_type': year_make_model_type, 'price': price})
scraped_data['price'] = scraped_data['price'].replace('', np.NaN)
scraped_data['price'] = scraped_data['price'].astype(float)
scraped_data.shape
scraped_data.head()
#driver.quit()
This last bit of code scrapes the highlights and flag_group per vehicle from its product page.
# Create additional features per product url (have to click on product to be able to scrape these features)
def getAndParseURL(url):
    result = requests.get(url)
    soup = BeautifulSoup(result.text, 'html.parser')
    return soup

highlights = []
flag_group = []

# Add features per vehicle
for url in page_urls:
    # Vehicle highlights
    highlights1 = []
    soup = getAndParseURL(url)
    if not soup.find("ul", class_="lot-highlights hidden-print"):
        highlights1.append(np.NaN)
    else:
        hl = soup.find("ul", class_="lot-highlights hidden-print").text.strip()
        hl = hl.replace("\n", ", ").strip()
        highlights1.append(hl)
    highlights.extend(highlights1)
    # Vehicle flag_group
    attraction = []
    soup = getAndParseURL(url)
    flag = soup.find(class_=["flag flag-main", "flag flag-star", "flag flag-feature"])
    if flag:
        attraction.append(flag.contents[0])
    else:
        attraction.append(np.NaN)
    flag_group.extend(attraction)

# Assign new features to existing df
scraped_data = scraped_data.assign(**{'highlights': highlights, 'flag_group': flag_group})  # , 'reserve': reserve})
scraped_data.shape
scraped_data.head()
Let me know/show me where you think the code above can be improved. Thanks for taking the time!
You really don't need all this very long code at all.
You don't even need Selenium.
You don't need to keep repeating your code and all this stuff.
The code below should achieve your goal easily!
Note: I've scraped only the first 3 pages; you can increase the loop for your desired target.
import requests
from bs4 import BeautifulSoup
from prettytable import PrettyTable

data = {
    "searchScope": "past",
    "searchText": "PLYMOUTH",
    "searchMake": "Plymouth",
    "searchModel": "Cuda",
    "searchYearStart": "1970",
    "searchYearEnd": "1971",
    "submit": ""
}
headers = {
    "Referer": "https://www.mecum.com",
}
login = {"email": "arjenvgeffen#hotmail.com"}

def main(url):
    with requests.Session() as req:
        r = req.post(
            "https://www.mecum.com/includes/login-action.cfm", data=login)
        p = PrettyTable()
        p.field_names = ["Name", "Url", "Price"]
        for item in range(1, 4):
            r = req.post(url.format(item), data=data, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            target = soup.select("div.lot")
            for tar in target:
                price = tar.span.text if tar.span.text else "N/A"
                hint = tar.select_one("a.lot-title")
                p.add_row(
                    [hint.text, f"{url[:21]}{hint['href']}", price])
        print(p)

main("https://www.mecum.com/search/page/{}/")
Output:
+----------------------------------------------------------+----------------------------------------------------------------------------------------------+----------+
| Name | Url | Price |
+----------------------------------------------------------+----------------------------------------------------------------------------------------------+----------+
| 1936 Plymouth Coupe | https://www.mecum.com/lots/HA0420-412309/1936-plymouth-coupe/ | N/A |
| 1937 Plymouth Deluxe Pickup | https://www.mecum.com/lots/HA0420-412385/1937-plymouth-deluxe-pickup/ | N/A |
| 1951 Plymouth Convertible | https://www.mecum.com/lots/HA0420-412744/1951-plymouth-convertible/ | N/A |
| 1968 Plymouth Road Runner | https://www.mecum.com/lots/HA0420-412874/1968-plymouth-road-runner/ | N/A |
| 1970 Plymouth Cuda | https://www.mecum.com/lots/HA0420-413047/1970-plymouth-cuda/ | N/A |
| 1971 Plymouth Cuda Convertible | https://www.mecum.com/lots/HA0420-413138/1971-plymouth-cuda-convertible/ | N/A |
| 1968 Plymouth Road Runner | https://www.mecum.com/lots/HA0420-427812/1968-plymouth-road-runner/ | N/A |
| 1969 Plymouth Road Runner | https://www.mecum.com/lots/AZ0320-404226/1969-plymouth-road-runner/ | $19,250 |
| 1973 Plymouth Duster Police Car | https://www.mecum.com/lots/AZ0320-404232/1973-plymouth-duster-police-car/ | $18,700 |
| 1963 Plymouth Valiant Signet 200 Convertible | https://www.mecum.com/lots/AZ0320-404250/1963-plymouth-valiant-signet-200-convertible/ | $3,850 |
| 1946 Plymouth Taxi | https://www.mecum.com/lots/AZ0320-404267/1946-plymouth-taxi/ | $3,300 |
| 1969 Plymouth GTX | https://www.mecum.com/lots/AZ0320-404449/1969-plymouth-gtx/ | $25,000 |
| 1999 Plymouth Prowler | https://www.mecum.com/lots/AZ0320-404457/1999-plymouth-prowler/ | $20,000 |
| 1967 Plymouth Barracuda Formula S Fastback | https://www.mecum.com/lots/AZ0320-404478/1967-plymouth-barracuda-formula-s-fastback/ | $33,000 |
| 1970 Plymouth Cuda Convertible | https://www.mecum.com/lots/AZ0320-404626/1970-plymouth-cuda-convertible/ | $51,700 |
| 1967 Plymouth GTX | https://www.mecum.com/lots/AZ0320-404634/1967-plymouth-gtx/ | $31,350 |
| 1970 Plymouth Cuda Resto Mod | https://www.mecum.com/lots/AZ0320-404636/1970-plymouth-cuda-resto-mod/ | $50,000 |
| 1969 Plymouth Road Runner | https://www.mecum.com/lots/AZ0320-404656/1969-plymouth-road-runner/ | $34,100 |
| 1970 Plymouth Cuda | https://www.mecum.com/lots/AZ0320-404858/1970-plymouth-cuda/ | $70,000 |
| 1970 Plymouth Superbird | https://www.mecum.com/lots/AZ0320-404866/1970-plymouth-superbird/ | $143,000 |
| 1967 Plymouth Satellite Convertible | https://www.mecum.com/lots/AZ0320-404883/1967-plymouth-satellite-convertible/ | $30,800 |
| 1970 Plymouth AAR Cuda | https://www.mecum.com/lots/AZ0320-404897/1970-plymouth-aar-cuda/ | $71,500 |
| 1967 Plymouth Barracuda Resto Mod | https://www.mecum.com/lots/AZ0320-404918/1967-plymouth-barracuda-resto-mod/ | $60,500 |
| 1969 Plymouth GTX Convertible | https://www.mecum.com/lots/AZ0320-404950/1969-plymouth-gtx-convertible/ | $42,000 |
| 1959 Plymouth Sport Fury | https://www.mecum.com/lots/AZ0320-404972/1959-plymouth-sport-fury/ | $30,000 |
| 1965 Plymouth Barracuda | https://www.mecum.com/lots/AZ0320-405120/1965-plymouth-barracuda/ | $22,000 |
| 1970 Plymouth Hemi Cuda | https://www.mecum.com/lots/AZ0320-405220/1970-plymouth-hemi-cuda/ | $150,700 |
| 1970 Plymouth Superbird | https://www.mecum.com/lots/AZ0320-405229/1970-plymouth-superbird/ | $115,000 |
| 1970 Plymouth Cuda | https://www.mecum.com/lots/AZ0320-405236/1970-plymouth-cuda/ | $52,500 |
| 1970 Plymouth Hemi Cuda | https://www.mecum.com/lots/AZ0320-405266/1970-plymouth-hemi-cuda/ | $130,000 |
| 1968 Plymouth Hemi Road Runner | https://www.mecum.com/lots/AZ0320-405267/1968-plymouth-hemi-road-runner/ | $70,000 |
| 1969 Plymouth Hemi Road Runner | https://www.mecum.com/lots/AZ0320-405286/1969-plymouth-hemi-road-runner/ | $62,000 |
| 1969 Plymouth Road Runner | https://www.mecum.com/lots/AZ0320-405304/1969-plymouth-road-runner/ | $120,000 |
| 1959 Plymouth Sport Fury Convertible | https://www.mecum.com/lots/AZ0320-405321/1959-plymouth-sport-fury-convertible/ | $70,000 |
| 1973 Plymouth Cuda Resto Mod | https://www.mecum.com/lots/AZ0320-405340/1973-plymouth-cuda-resto-mod/ | $75,000 |
| 1969 Plymouth Sport Satellite Convertible | https://www.mecum.com/lots/AZ0320-405384/1969-plymouth-sport-satellite-convertible/ | $37,400 |
| 1970 Plymouth AAR Cuda | https://www.mecum.com/lots/AZ0320-405385/1970-plymouth-aar-cuda/ | $55,000 |
| 1969 Plymouth Road Runner | https://www.mecum.com/lots/AZ0320-423532/1969-plymouth-road-runner/ | $60,500 |
| 1970 Plymouth Hemi Cuda | https://www.mecum.com/lots/AZ0320-423534/1970-plymouth-hemi-cuda/ | $93,500 |
| 1968 Plymouth Hemi Road Runner | https://www.mecum.com/lots/AZ0320-423535/1968-plymouth-hemi-road-runner/ | $66,000 |
| 1970 Plymouth Cuda | https://www.mecum.com/lots/AZ0320-423545/1970-plymouth-cuda/ | $60,000 |
| 1940s-50s Desoto Plymouth Double-Sided Porcelain 45x42 | https://www.mecum.com/lots/AZ0320-424465/1940s-50s-desoto-plymouth-double-sided-porcelain/ | $2,950 |
| 1940s-50s Dodge Plymouth Double-Sided Porcelain 42-in | https://www.mecum.com/lots/AZ0320-424468/1940s-50s-dodge-plymouth-double-sided-porcelain/ | $5,900 |
| 1940s-50s Chrysler Plymouth Double-Sided Porcelain 42-in | https://www.mecum.com/lots/AZ0320-424471/1940s-50s-chrysler-plymouth-double-sided-porcelain/ | $3,776 |
| 1969 Plymouth Road Runner | https://www.mecum.com/lots/AZ0320-424624/1969-plymouth-road-runner/ | $59,400 |
| 1965 Plymouth Sport Fury Convertible | https://www.mecum.com/lots/AZ0320-424629/1965-plymouth-sport-fury-convertible/ | $13,750 |
| 1970 Plymouth Road Runner Convertible | https://www.mecum.com/lots/AZ0320-428253/1970-plymouth-road-runner-convertible/ | $45,000 |
| 1970 Plymouth Barracuda Convertible | https://www.mecum.com/lots/AZ0320-428658/1970-plymouth-barracuda-convertible/ | $42,900 |
| 1966 Plymouth Barracuda | https://www.mecum.com/lots/FL0120-394693/1966-plymouth-barracuda/ | $9,625 |
| 1965 Plymouth Barracuda | https://www.mecum.com/lots/FL0120-394746/1965-plymouth-barracuda/ | $7,700 |
| 1969 Plymouth Satellite | https://www.mecum.com/lots/FL0120-394747/1969-plymouth-satellite/ | $3,850 |
| 1954 Plymouth Savoy | https://www.mecum.com/lots/FL0120-394753/1954-plymouth-savoy/ | $7,150 |
| 1952 Plymouth Police Car | https://www.mecum.com/lots/FL0120-394828/1952-plymouth-police-car/ | N/A |
| 1970 Plymouth Duster | https://www.mecum.com/lots/FL0120-394921/1970-plymouth-duster/ | $26,400 |
| 1965 Plymouth Barracuda | https://www.mecum.com/lots/FL0120-394956/1965-plymouth-barracuda/ | $8,800 |
| 1950 Plymouth Special Deluxe | https://www.mecum.com/lots/FL0120-394983/1950-plymouth-special-deluxe/ | $8,250 |
| 1973 Plymouth Road Runner | https://www.mecum.com/lots/FL0120-395009/1973-plymouth-road-runner/ | $21,000 |
| 1970 Plymouth Road Runner | https://www.mecum.com/lots/FL0120-395013/1970-plymouth-road-runner/ | $51,700 |
| 1969 Plymouth Barracuda | https://www.mecum.com/lots/FL0120-395106/1969-plymouth-barracuda/ | $17,600 |
| 1966 Plymouth Satellite Convertible | https://www.mecum.com/lots/FL0120-395145/1966-plymouth-satellite-convertible/ | $26,400 |
| 1970 Plymouth Road Runner | https://www.mecum.com/lots/FL0120-395341/1970-plymouth-road-runner/ | $47,300 |
| 1970 Plymouth Cuda | https://www.mecum.com/lots/FL0120-395362/1970-plymouth-cuda/ | $61,000 |
| 1999 Plymouth Prowler Convertible | https://www.mecum.com/lots/FL0120-395647/1999-plymouth-prowler-convertible/ | $30,800 |
+----------------------------------------------------------+----------------------------------------------------------------------------------------------+----------+
I've edited Ahmed's code to get my desired output (a pandas DataFrame):
import requests
from bs4 import BeautifulSoup
from prettytable import PrettyTable
import numpy as np
import pandas as pd

data = {
    "searchScope": "past",
    "searchMake": "Plymouth",
    "searchModel": "Cuda",
    "searchYearStart": "1970",
    "searchYearEnd": "1971",
    "submit": ""
}
headers = {
    "Referer": "https://www.example.com",
}
login = {"email": "example#hotmail.com"}

price = []
urls = []
title = []
results = []

def main(url):
    with requests.Session() as req:
        r = req.post(
            "https://www.example.com/includes/login-action.cfm", data=login)
        for item in range(1, 30):
            r = req.post(url.format(item), data=data, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            target = soup.select("div.lot")
            for tar in target:
                urls.append(tar.a.get('href'))
                title.append(tar.select_one("a.lot-title").text)
                price.append(tar.span.text if tar.span.text else np.NaN)
                results2 = tar.select("div[class*=lot-image-container]")
                for result in results2:
                    results.append(' '.join(result['class']))

main("https://www.example.com/search/page/{}/")

scraped_data = pd.DataFrame({'url': urls, 'year_make_model_type': title, 'price': price, 'results': results})
scraped_data.shape
scraped_data["results"] = scraped_data["results"].str.replace("lot-image-container", "")
scraped_data["results"] = scraped_data["results"].replace('', np.NaN)
scraped_data.head()
Now I want to extract features from the list of product pages in the 'url' column. Below is a working example, but it's way too slow. I've tried fixing it with multiprocessing but I haven't figured it out yet; I want to extract about 10 more features for 500+ pages, so it has to be faster than this (see the concurrency sketch after the example below).
low_url = ['https://www.mecum.com/lots/KC1210-101030/1970-plymouth-cuda/',
           'https://www.mecum.com/lots/SC0510-91294/1970-plymouth-hemi-cuda/',
           'https://www.mecum.com/lots/KC1210-100686/1970-plymouth-barracuda-convertible/',
           'https://www.mecum.com/lots/KA0316-235834/1970-plymouth-barracuda-convertible/',
           'https://www.mecum.com/lots/FL0110-88180/1970-plymouth-barracuda/']

reserve = []
with requests.Session() as req:
    for url in low_url:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        attraction2 = []
        if not soup.find(class_=["flag flag-no-reserve"]):
            attraction2.append(np.NaN)
        else:
            r = soup.find(class_=["flag flag-no-reserve"])
            attraction2.append(r.contents[0])
        reserve.extend(attraction2)

len(reserve)
len(set(reserve))
reserve
Out: ['No Reserve', nan, nan, 'No Reserve', nan]
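A minimal sketch of how these per-page requests could be parallelised with a thread pool (concurrent.futures is standard library; the no-reserve selector is taken from the loop above, while the worker count is just an assumption to tune):

from concurrent.futures import ThreadPoolExecutor

import numpy as np
import requests
from bs4 import BeautifulSoup

def scrape_reserve(url):
    # One GET per product page; return the flag text, or NaN if the flag is absent.
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    flag = soup.find(class_=["flag flag-no-reserve"])
    return flag.contents[0] if flag else np.NaN

# Download the pages concurrently; executor.map preserves input order,
# so the results still line up with the urls in low_url.
with ThreadPoolExecutor(max_workers=10) as executor:
    reserve = list(executor.map(scrape_reserve, low_url))

The same pattern extends to a function that returns several features per page, which can then be assembled into a DataFrame.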
