Web scraper suddenly stopped working. Potential issue with Cloudflare? - python

I've built a simple web scraper (below) that scrapes some information from https://www.thewhiskyexchange.com/new-products/standard-whisky every minute or so.
It had been working fine up until today, when it suddenly stopped working. Changing the loop to
for product in soup.select('a'):
prints out:
[Chrome Web Store, Cloudflare]
Could this be an authentication issue caused by Cloudflare? Is there a way around this?
Full code:
import ssl
import requests
import sys
import time
import smtplib
from email.message import EmailMessage
import hashlib
from urllib.request import urlopen
from datetime import datetime
import json
import random
from bs4 import BeautifulSoup
from selenium import webdriver

user_agent_list = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
]

for i in range(1, 4):
    # Pick a random user agent
    user_agent = random.choice(user_agent_list)
    # Set the headers
    headers = {'User-Agent': user_agent}
    url = 'https://www.thewhiskyexchange.com/new-products/standard-whisky/'
    response = requests.get(url, headers=headers)

    bottles = []
    link = []
    link2 = []
    link3 = []
    soup = BeautifulSoup(response.text, features="html.parser")
    oldlinks = []
    product_name_old = []
    for product in soup.select('li.product-grid__item'):
        product_name_old.append(product.a.attrs['title'])
        oldlinks.append(product.a.attrs['href'])
    product_size_old = len(product_name_old)
    print("Setup Complete", product_size_old)
    link4 = "\n".join("{}\nhttps://www.thewhiskyexchange.com{}".format(x, y) for x, y in zip(product_name_old, oldlinks))
    print(link4)

The following still returns the full product grid, using httpx with a regular browser User-Agent:
import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
}


async def main(url):
    async with httpx.AsyncClient(timeout=None) as client:
        client.headers.update(headers)
        r = await client.get(url)
        soup = BeautifulSoup(r.text, 'lxml')
        # url[:33] is 'https://www.thewhiskyexchange.com', so the relative hrefs become absolute links
        goal = [(x['title'].strip(), url[:33] + x['href'])
                for x in soup.select('.product-card')]
        df = pd.DataFrame(goal, columns=['Title', 'Link'])
        print(df)


if __name__ == "__main__":
    trio.run(main, 'https://www.thewhiskyexchange.com/new-products/standard-whisky/')
Output:
Title Link
0 Macallan 18 Year Old Sherry Oak 2020 Release https://www.thewhiskyexchange.com/p/56447/maca...
1 Benriach The Thirty 30 Year Old https://www.thewhiskyexchange.com/p/60356/benr...
2 Maker's Mark Kentucky Mule Cocktail Kit https://www.thewhiskyexchange.com/p/61132/make...
3 Isle of Raasay Single Malt https://www.thewhiskyexchange.com/p/60558/isle...
4 Caol Ila 2001 19 Year Old Exclusive to The Whi... https://www.thewhiskyexchange.com/p/61099/caol...
.. ... ...
75 MB Roland Single Barrel Bourbon https://www.thewhiskyexchange.com/p/60403/mb-r...
76 Seven Seals The Age of Scorpio https://www.thewhiskyexchange.com/p/60373/seve...
77 Seven Seals The Age of Aquarius https://www.thewhiskyexchange.com/p/60372/seve...
78 Langatun 2016 Pedro Ximenez Sherry Cask Finish https://www.thewhiskyexchange.com/p/60371/lang...
79 Speyburn 2009 11 Year Old Sherry Cask Connoiss... https://www.thewhiskyexchange.com/p/60411/spey...
[80 rows x 2 columns]

Related

Can't get all results in TripAdvisor using Python and BeautifulSoup due to pagination

I am trying to get links to restaurants, but I can only get the first 30 and not all the others.
There are hundreds of restaurants in the Madrid area, but the pagination only shows 30 per page, and the following code only gets those 30:
import re
import requests
from openpyxl import Workbook
from bs4 import BeautifulSoup as b

city_name = 'Madrid'
geo_code = '187514'
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
data = requests.get(
    "https://www.tripadvisor.com//Restaurants-g{}-{}.html".format(geo_code, city_name), headers=headers
).text
for link in re.findall(r'"detailPageUrl":"(.*?)"', data):
    print("https://www.tripadvisor.com.sg/" + link)
    next_link = "https://www.tripadvisor.com.sg/" + link
    f.write('%s\n' % next_link)  # f: output file handle, opened elsewhere (not shown)
Found the solution: I had to add an oa offset with the number of the result to the URL, like:
"https://www.tripadvisor.com//Restaurants-g{}-{}-{}.html".format(geo_code, city_name, n_review), headers=headers

urlopen Returning Redirect Error for Valid Links [HTTP Error 308: Permanent Redirect]

I'm trying to scrape Amazon listings and I am consistently getting a redirect error with my scraper. I even used http.cookiejar.CookieJar and a urllib.request.HTTPCookieProcessor to avoid the redirect loop, but I am still getting the error.
from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
import urllib.request
import time
import random
from requests.exceptions import HTTPError
from socket import error as SocketError
from http.cookiejar import CookieJar

data = []

def getdata(url):
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    ]
    user_agent = random.choice(user_agents)
    header_ = {'User-Agent': user_agent}
    req = urllib.request.Request(url, headers=header_)
    cj = CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    response = opener.open(req)
    amazon_html = response.read().decode('utf8', errors='ignore')
    a_soup = soup(amazon_html, 'html.parser')
    cat = k  # relies on the loop variable k defined at module level below
    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        try:
            asin = e.find('a')['href'].replace('dp%2F', '/dp/').split('/dp/')[1].replace('%2', '/ref').split('/ref')[0]
        except:
            asin = 'No ASIN Found'
        try:
            title = e.find('h2').text
        except:
            title = None
        data.append({
            'Category': cat,
            'ASIN': asin,
            'Title': title
        })
    return a_soup

def getnextpage(a_soup):
    try:
        page = a_soup.find('a', attrs={"class": 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})['href']
        url = 'http://www.amazon.in' + str(page)
    except:
        url = None
    return url

keywords = ['headphone', 'mobile', 'router', 'smartwatch']
for k in keywords:
    url = 'https://www.amazon.in/s?k=' + k
    while True:
        geturl = getdata(url)
        url = getnextpage(geturl)
        if not url:
            break
        print(url)
Output:
HTTPError: HTTP Error 308: Permanent Redirect
Any ideas how I can correct this?
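One plausible culprit, for what it's worth: getnextpage() builds the next-page URL with http://www.amazon.in, and urllib's redirect handler (before Python 3.11) does not follow 308 responses, so the opener raises instead of redirecting. A minimal sketch that builds the URL with https:// instead, assuming the selector and the rest of the script are unchanged:
def getnextpage(a_soup):
    # Same next-page lookup as before, but prefix the relative href with https://
    # so urllib never has to follow an http -> https 308 redirect.
    try:
        page = a_soup.find('a', attrs={"class": 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})['href']
        url = 'https://www.amazon.in' + str(page)  # https instead of http
    except (TypeError, KeyError):
        url = None
    return url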

Scraping an HTML table shows the error 'AttributeError: 'NoneType' object has no attribute 'select''

I am scraping an HTML table and it shows me the error 'AttributeError: 'NoneType' object has no attribute 'select''. Please help me solve it.
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}
r = requests.get("https://capitalonebank2.bluematrix.com/sellside/Disclosures.action")
soup = BeautifulSoup(r.content, "lxml")
table = soup.find('table', attrs={'style': "border"})

all_data = []
for row in table.select("tr:has(td)"):
    tds = [td.get_text(strip=True) for td in row.select("td")]
    all_data.append(tds)

df = pd.DataFrame(all_data, columns=header)
print(df)
It appears that the website you are trying to scrape blocks requests sent by the requests library. To deal with the issue, I used the Selenium library, which automates browser interaction. The code below collects the titles given in the table.
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

browser = webdriver.Chrome()
browser.get("https://capitalonebank2.bluematrix.com/sellside/Disclosures.action")
soup = BeautifulSoup(browser.page_source, "lxml")

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"}

all_data = [i.text.strip() for i in soup.select("option")]
df = pd.DataFrame(all_data, columns=["Titles"])
print(df)
Output:
Titles
0 Agree Realty Corporation (ADC)
1 American Campus Communities, Inc. (ACC)
2 Antero Midstream Corporation (AM)
3 Antero Resources Corporation (AR)
4 Apache Corp. (APA)
.. ...
126 W. P. Carey Inc. (WPC)
127 Washington Real Estate Investment Trust (WRE)
128 Welltower Inc. (WELL)
129 Western Midstream Partners, LP (WES)
130 Whiting Petroleum Corporation (WLL)
If you have not used Selenium before, do not forget to install chromedriver.exe and add it to the PATH environment variable. You can also give the location of the driver to the constructor manually.
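For example, a minimal sketch of pointing the constructor at a local driver (the path below is a placeholder):
from selenium import webdriver

# Selenium 3.x style: pass the chromedriver location directly (placeholder path)
browser = webdriver.Chrome(executable_path=r"C:\tools\chromedriver.exe")

# Selenium 4.x style: wrap the path in a Service object instead
# from selenium.webdriver.chrome.service import Service
# browser = webdriver.Chrome(service=Service(r"C:\tools\chromedriver.exe"))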
Updated code to extract extra information
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

browser = webdriver.Chrome()
browser.get("https://capitalonebank2.bluematrix.com/sellside/Disclosures.action")

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"}

for title in browser.find_elements_by_css_selector('option'):
    title.click()
    time.sleep(1)
    browser.switch_to.frame(browser.find_elements_by_css_selector("iframe")[1])
    table = browser.find_element_by_css_selector("table table")
    soup = BeautifulSoup(table.get_attribute("innerHTML"), "lxml")
    all_data = []
    ratings = {"BUY": [], "HOLD": [], "SELL": []}
    lists_ = []
    for row in soup.select("tr")[-4:-1]:
        info_list = row.select("td")
        count = info_list[1].text
        percent = info_list[2].text
        IBServ_count = info_list[4].text
        IBServ_percent = info_list[5].text
        lists_.append([count, percent, IBServ_count, IBServ_percent])
    ratings["BUY"] = lists_[0]
    ratings["HOLD"] = lists_[1]
    ratings["SELL"] = lists_[2]
    print(ratings)
    browser.switch_to.default_content()

Grouping results from Python Beautifulsoup extracted table data for more readability

The following snippet is working, but for the purpose of readability I need help formatting the result on screen.
from urllib.request import Request, urlopen, urljoin
from bs4 import BeautifulSoup
import re, random, ctypes
import requests
from time import sleep

url = 'https://bscscan.com/tokentxns'
user_agent_list = [
    "header = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0Gecko/20100101 Firefox/86.0'}",
    "header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'}",
    "header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15'}",
    "header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'}",
    "header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'}",
    "header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36'}"
]
header = random.choice(user_agent_list)
req = requests.get(url, header, timeout=10)
soup = BeautifulSoup(req.content, 'html.parser')
rows = soup.findAll('table')[0].findAll('tr')

for row in rows[1:]:
    tds = row.find_all('td')
    txnhash = tds[1].text[0:]
    value = tds[7].text[0:]
    token = tds[8].text[0:]
    link = urljoin(url, tds[8].find('a')['href'])
    # [26:] strips 'https://bscscan.com/token/' and leaves just the contract address
    print(str(link)[26:] + "\t" + str(token) + "\t\t" + str(value))
Current Output:
0x154a9f9cbd3449ad22fdae23044319d6ef2a1fab CryptoBlades... (SKILL) 0
0x46d502fac9aea7c5bc7b13c8ec9d02378c33d36f WolfSafePoor... (WSPP) 532,654,321,110
0xb510e39a6cc3ebe999ff957ae7b5813d3326af88 GoldenBresco (GoBo) 0.1
0xbb4cdb9cbd36b01bd1cbaebf2de08d9173bc095c Wrapped BNB (WBNB) 0.193446389516094066
0xb510e39a6cc3ebe999ff957ae7b5813d3326af88 GoldenBresco (GoBo) 0.003
Wanted improvement (grouping into 3 columns):
0x154a9f9cbd3449ad22fdae23044319d6ef2a1fab CryptoBlades... (SKILL) 2.746949883778173559
CryptoBlades... (SKILL) 0.971749999999999991
CryptoBlades... (SKILL) 0
0xbb4cdb9cbd36b01bd1cbaebf2de08d9173bc095c Wrapped BNB (WBNB) 0.1
Wrapped BNB (WBNB) 0.193446389516094066
Wrapped BNB (WBNB) 0.3
Try:
import requests
from bs4 import BeautifulSoup
from itertools import groupby

url = "https://bscscan.com/tokentxns"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

data = []
for tr in soup.select("tr:has(td)"):
    tds = [td.get_text(strip=True) for td in tr.select("td")]
    _, txn_hash, tm, age, from_, _, to_, value, token = tds
    data.append((txn_hash, token, value))

data = sorted(data)
for _, g in groupby(data, lambda k: k[0]):
    g = list(map(list, g))
    for subl in g[1:]:
        subl[0] = ""
    for subl in g:
        print("{:<67} {:<27} {:<20}".format(*subl))
    print()
Prints:
0x0883f7ada1e30d266366577dbc46cd86a8deb737d669758a443ef03859ea551a FEGtoken (FEG) 1,946,201,644.40754275
Wrapped BNB (WBNB) 0.025356409113673479
0x41a7e28aa1f88522ba477718f9ea93d927bd8c456cd77c75691d961ac01da626 KOMOCOIN (KMC) 1,500
KOMOCOIN (KMC) 750
0x54bf03ddb42a151920fc2352a8419ed24720422b79c4956c74ab1d51aead142e BABY CAKE (BABYCA...) 140.806276687606518422
BABY CAKE (BABYCA...) 165.654443161890021673
BABY CAKE (BABYCA...) 2,164,578.319665288243959287
BABY CAKE (BABYCA...) 238.930554998160499529
BABY CAKE (BABYCA...) 42.164215587910676387
BABY CAKE (BABYCA...) 462,482.805614060076081865
BABY CAKE (BABYCA...) 797.902234563103604395
BABY CAKE (BABYCA...) 938.708511250710122817
BABY CAKE PR...(BBCAKE...) 190,322,532.495690243057683413
BABY CAKE PR...(BBCAKE...) 2,526,729.458161278746350005
BABY CAKE PR...(BBCAKE...) 251,979.604709746169304594
BABY CAKE PR...(BBCAKE...) 252,609.914806456810514054
BABY CAKE PR...(BBCAKE...) 36,251,910.951560046296701602
BABYCAKE_Div...(BABYCA...) 238.930554998160499529
Pancake LPs (Cake-L...) 0.222139817418176568
Pancake LPs (Cake-L...) 13.786493105169560097
Pancake LPs (Cake-L...) 486.96534350290155168
Pancake LPs (Cake-L...) 5.76850094907955108
PancakeSwap ...(Cake) 0.001286990618481616
PancakeSwap ...(Cake) 0.112893929385320841
PancakeSwap ...(Cake) 1.497338191475435628
PancakeSwap ...(Cake) 61.821404790611192339
PancakeSwap ...(Cake) 61.821404790611192339
Wrapped BNB (WBNB) 0.000146050638113703
Wrapped BNB (WBNB) 0.000146050638113703
Wrapped BNB (WBNB) 0.00146079350317574
Wrapped BNB (WBNB) 0.109629866733835175
Wrapped BNB (WBNB) 0.610745057130530703
Wrapped BNB (WBNB) 2.850122532653068215
0x6cc6153aa387de6a56c905f7d424ec38f047fefdcc2b7d766c53db7807b6f562 CryptoBlades...(SKILL) 0.005999999999999999
CryptoBlades...(SKILL) 0.06
0x776a1edc9446cc3e160cb08a69e2824dab0e6df7b6c79f252a1c9a0de4733bd4 Arena Token (ARENA) 0.000802589119468346
Arena Token (ARENA) 0.037402597402597402
Arena Token (ARENA) 0.374025974025974025
0x7ca15e96d56d686d79a93271e192021fefed01187dce424bec835f1a6a47b937 CryptoBlades...(SKILL) 0.971749999999999991
0x7f6bada297def57a2d1823000d464923187bea376c5747ba6ebe0b63b1ae1850 CryptoBlades...(SKILL) 0
0x8ddaceff011648b2f13128c8ce4ff5654171878200e12f2ce8f9cf3ec4ab97a3 CryptoBlades...(SKILL) 0.051999999999999999
CryptoBlades...(SKILL) 0.52
0x91d299dc263ac4e30027c5e54e5a5fd4fd2fb814db7c0fc00643764f8710e47b CryptoBlades...(SKILL) 0
0xa097fad173e3d6551e2a837048f40348ffcafc710ca13410de1fb532f2833ba7 Niubi Token (NIU) 2,152.08364390963091904
Wrapped BNB (WBNB) 0.05
0xf2c10ec09049cd810c3aac459b85b9bbbcbb53f3b78341d24af1cab585d6e1ba Foxy Equilib...(Foxy) 0.9
Foxy Equilib...(Foxy) 0.9
Foxy Equilib...(Foxy) 7.2
0xf5b44e82e4e4509d59b51491ce1bfa44888fae2c11a65bd5021d2aed9c75afd4 CryptoBlades...(SKILL) 0.055005280975673767
Wrapped BNB (WBNB) 0.022533425242910644
EDIT: To print token URL instead of name:
import requests
from bs4 import BeautifulSoup
from itertools import groupby

url = "https://bscscan.com/tokentxns"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

data = []
for tr in soup.select("tr:has(td)"):
    tds = [td.get_text(strip=True) for td in tr.select("td")]
    _, txn_hash, tm, age, from_, _, to_, value, token = tds
    a = "https://bscscan.com" + tr.select("a")[-1]["href"]
    data.append((txn_hash, a, value))

data = sorted(data)
for _, g in groupby(data, lambda k: k[0]):
    g = list(map(list, g))
    for subl in g[1:]:
        subl[0] = ""
    for subl in g:
        print("{:<67} {:<27} {:<20}".format(*subl))
    print()
I didn't get a response from you regarding the pastebin, so here is the approach I was suggesting of simply styling a dataframe. I said in the comments that it seems you really just want to order by the first column and then not repeat items within that column. You can do that with sort_values(), using duplicated() to replace duplicates with ''. I've borrowed Andrej's (upvoted) tidier syntax for populating the list of lists.
You can style the dataframe as you see fit. I hid the borders between cells and set the background to white for example.
import pandas as pd
from bs4 import BeautifulSoup
import requests, random

url = 'https://bscscan.com/tokentxns'
user_agent_list = [
    "header = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0Gecko/20100101 Firefox/86.0'}",
    "header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'}",
    "header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15'}",
    "header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'}",
    "header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'}",
    "header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36'}"
]
header = random.choice(user_agent_list)
req = requests.get(url, header, timeout=10)
soup = BeautifulSoup(req.content, 'html.parser')

df_rows = []
for tr in soup.select("tr:has(td)"):
    tds = [td.get_text(strip=True) for td in tr.select("td")]
    _, txnhash, tm, age, from_, _, to_, value, token = tds
    df_rows.append([txnhash, token, value])

df = pd.DataFrame(df_rows, columns=['hash', 'token', 'value'])
df['value'] = pd.to_numeric(df['value'].apply(lambda x: x.replace(',', '')))
df.sort_values(['hash', 'token'], inplace=True)
df.hash = [i[1] if not i[0] else '' for i in zip(df.duplicated(subset=['hash']), df.hash)]
#df.reset_index(drop=True, inplace=True)

df.style.format(formatter={('value'): "{:,.3f}"}).hide_index() \
    .set_properties(**{'background-color': 'white', 'text-align': 'left'}, padding="10px", border='0px solid white') \
    .set_table_styles([dict(selector='th', props=[('text-align', 'left')])])

Printing Text Scraped Using BeautifulSoup to Pandas Dataframe without Tags

I have been working on the code below and getting myself tied up in knots. What I am trying to do is build a simple dataframe using text scraped using BeautifulSoup.
I have scraped the applicable text from the <h5> and <p> tags but using find_all means that when I build the dataframe and write to csv the tags are included. To deal with this I have added the print(p.text, end=" ") statements but now nothing is being written to the csv.
Can anyone see what I am doing wrong?
import pandas as pd
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}

course = []
runner = []

page = requests.get('https://www.attheraces.com/tips/atr-tipsters/hugh-taylor', headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
tips = soup.find('div', class_='sticky')

for h5 in tips.find_all("h5"):
    course_name = print(h5.text, end=" ")
    course.append(course_name)
for p in tips.find_all("p"):
    runner_name = print(p.text, end=" ")
    runner.append(runner_name)

todays_tips = pd.DataFrame(
    {'Course': course,
     'Selection': runner,
     })
print(todays_tips)
todays_tips.to_csv(r'C:\Users\*****\Today.csv')
Don't assign the result of print (it returns None, so your course and runner lists end up full of None values); use a list comprehension instead. Applying this should get you the dataframe you want.
For example:
import pandas as pd
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}

page = requests.get('https://www.attheraces.com/tips/atr-tipsters/hugh-taylor', headers=headers)
tips = BeautifulSoup(page.content, 'html.parser').find('div', class_='sticky')

course = [h5.getText() for h5 in tips.find_all("h5")]
runner = [p.getText() for p in tips.find_all("p")]

todays_tips = pd.DataFrame({'Course': course, 'Selection': runner})
print(todays_tips)
todays_tips.to_csv("your_data.csv", index=False)
Output:
Course Selection
0 1.00 HAYDOCK 1pt win RAINBOW JET (12-1 & 11-1 general)
1 2.50 GOODWOOD 1pt win MARSABIT (11-2 general)
And a .csv file (your_data.csv) is written as well.
