I can't figure out why I get a blank output file - python

# Python 2 script: for every match id in matches.csv, download the match
# report page, pull team names, scores and per-player stats out of the HTML,
# and write one CSV row per player to output.csv.
# NOTE(review): indentation was lost in this listing, so loop nesting cannot
# be read off the text; presumably everything from `id = str(id)` down to the
# final writerows() runs once per match - confirm against the original file.
import csv
import requests
import re
from bs4 import BeautifulSoup
import sys
# Python 2 only: reload(sys) restores sys.setdefaultencoding, which is then
# used to make UTF-8 the default for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding('utf8')
#CREATE CSV FILE
# The handle is never closed or flushed anywhere in this listing.
outfile = open("./output.csv", "wb")
writer = csv.writer(outfile)
#IMPORT MATCHES
import csv
with open('matches.csv', 'rb') as f:
reader = csv.reader(f)
matches = list(reader)
for id in matches:
# Each entry is a whole CSV row (a list); stringify it and keep only digits.
id = str(id)
id = re.sub("[^0-9]","",id)
url = 'http://www.virtualpronetwork.com/apps/fvpaa/matches/match_report/' + id
print (url)
response = requests.get(url)
html = response.content
# No parser is named here, so bs4 chooses one itself.
soup = BeautifulSoup(html)
#GET TEAMS AND SCORES
# The matching divs are consumed by position:
# 0 = home team, 1 = away team, 2 = home goals, 3 = away goals.
score = soup.findAll("div",{"class":"col-md-5 center"})
team_home = score[0]
team_home = str(team_home)
# re.search returns None when the literal `" />` tail is not present in the
# serialized tag; .group(1) on None then raises AttributeError.
team_home = re.search('title="(.*)" />',team_home)
team_home = team_home.group(1)
team_away = score[1]
team_away = str(team_away)
team_away = re.search('title="(.*)" />',team_away)
team_away = team_away.group(1)
goals_home = score[2]
goals_home = str(goals_home)
# Strip the wrapping markup so only the text between <h2> and </h2> remains.
goals_home = re.sub('</h2></div>','',goals_home)
goals_home = re.sub('<div class="col-md-5 center"><h2>','',goals_home)
goals_away = score[3]
goals_away = str(goals_away)
goals_away = re.sub('</h2></div>','',goals_away)
goals_away = re.sub('<div class="col-md-5 center"><h2>','',goals_away)
#GET HOME STATS
tables = soup.findChildren('table')
stats_home = tables[0]
list_of_rows_home = []
# [1:] skips the first <tr> (presumably the header row - confirm).
for row in stats_home.findChildren('tr')[1:]:
list_of_cells = []
# Iterates the child nodes of the first <td>, not the <td> itself.
for cell in row.findChildren('td')[0]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[1]:
text = cell.text
list_of_cells.append(text)
# Remaining cells are kept as tag objects so their markup can be searched.
for cell in row.findChildren('td')[2:]:
list_of_cells.append(cell)
list_of_rows_home.append(list_of_cells)
for i in range(len(list_of_rows_home)):
row = list_of_rows_home[i]
# Index 2 is the first raw tag only if the first two cells contributed
# exactly one entry each - TODO confirm.
cell = list_of_rows_home[i][2]
cell = str(cell)
# Count occurrences of each marker word in the cell's markup.
goal = re.findall('goal',cell)
goal = goal.count('goal')
# Integer division under Python 2; presumably each goal mentions 'goal'
# twice in the markup - confirm.
goal = goal / 2
assist = re.findall('assist',cell)
assist = assist.count('assist')
assist = assist / 2
motm = re.findall('motm',cell)
motm = motm.count('motm')
row.append(goal)
row.append(assist)
row.append(motm)
# Drop the raw cell now that goal/assist/motm counts have been appended.
for row in list_of_rows_home:
del row[2]
# Tag every home row with own team/score first, then opponent team/score.
for i in range(len(list_of_rows_home)):
row = list_of_rows_home[i]
row.append(team_home)
row.append(goals_home)
row.append(team_away)
row.append(goals_away)
#GET AWAY STATS
# Same steps as the home section, applied to the second table.
stats_away = tables[1]
list_of_rows_away = []
for row in stats_away.findChildren('tr')[1:]:
list_of_cells = []
for cell in row.findChildren('td')[0]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[1]:
text = cell.text
list_of_cells.append(text)
for cell in row.findChildren('td')[2:]:
list_of_cells.append(cell)
list_of_rows_away.append(list_of_cells)
for i in range(len(list_of_rows_away)):
row = list_of_rows_away[i]
cell = list_of_rows_away[i][2]
cell = str(cell)
goal = re.findall('goal',cell)
goal = goal.count('goal')
goal = goal / 2
assist = re.findall('assist',cell)
assist = assist.count('assist')
assist = assist / 2
motm = re.findall('motm',cell)
motm = motm.count('motm')
row.append(goal)
row.append(assist)
row.append(motm)
for row in list_of_rows_away:
del row[2]
# Away rows list the away team/score first, then the home team/score.
for i in range(len(list_of_rows_away)):
row = list_of_rows_away[i]
row.append(team_away)
row.append(goals_away)
row.append(team_home)
row.append(goals_home)
#COMPILE INTO ONE TABLE
list_of_rows = list_of_rows_home + list_of_rows_away
#WRITE TO CSV
writer.writerows(list_of_rows)
My input file is a basic Excel file with the match IDs all lined up in column one. When the script creates the output file, it is blank. I am not getting any error messages either.

The issue is in your regex search, so perhaps change it to:
team_home = re.search('title="(.*)"',team_home)
team_home = team_home.group(1)
Alternative:
team_home = re.search('title="(.*)"/>',team_home)
team_home = team_home.group(1)
The ` />` suffix is not needed. Because of the space before `/>`, the pattern fails to match, so re.search returns None and calling .group(1) on it raises an AttributeError, which stops the script. If you want to keep `/>` in the pattern, remove the space before it, since that is ultimately what breaks the match.

Related

JSONDecodeError: Expecting value: line 1 column 1 (char 0) when using Pushshift API to scrape Reddit Data

import pandas as pd
import requests
import json
import datetime
import csv
def get_pushshift_data(after, before, sub):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Builds the query URL from the ``after``/``before`` bounds and the
    subreddit name ``sub``, prints it, and returns the ``'data'`` entry of
    the decoded JSON response.
    """
    window = '&after={}&before={}&subreddit={}'.format(after, before, sub)
    url = (
        'https://api.pushshift.io/reddit/search/submission/?'
        + window
        + '&sort=asc&sort_type=created_utc&size=400'
    )
    print(url)
    payload = requests.get(url).json()
    return payload['data']
def collect_subData(subm):
    """Store the fields of one submission in the global ``subStats`` dict.

    ``subm`` is a mapping describing a single submission.  The extracted
    values are packed into one tuple, wrapped in a single-element list, and
    saved under the submission id.  A missing ``link_flair_text`` becomes
    ``"NaN"`` and a missing ``selftext`` becomes an empty string; any other
    missing key raises ``KeyError``.
    """
    title = subm['title']
    url = subm['url']
    flair = subm.get('link_flair_text', "NaN")
    # Body text of the post.
    body = subm.get('selftext', '')
    author = subm['author']
    subId = subm['id']
    score = subm['score']
    created = datetime.datetime.fromtimestamp(subm['created_utc'])
    numComms = subm['num_comments']
    permalink = subm['permalink']
    record = (subId, title, body, url, author, score, created, numComms,
              permalink, flair)
    subStats[subId] = [record]
def update_subFile():
    """Prompt for a file name and write every collected submission to CSV.

    The file is created under ``subreddit_data_uncleaned/`` with a header row
    followed by one row per entry of the global ``subStats`` dict.  Prints
    how many rows were written.
    """
    headers = ["Post ID","Title","Body","Url","Author","Score","Publish Date","Total No. of Comments","Permalink","Flair"]
    print("Input filename of submission file, please add .csv")
    target = "subreddit_data_uncleaned/" + input()
    written = 0
    with open(target, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle, delimiter=',')
        out.writerow(headers)
        for sub_id in subStats:
            # Each value is a one-element list holding the row tuple.
            out.writerow(subStats[sub_id][0])
            written += 1
    print(str(written) + " submissions have been uploaded into a csv file")
# global dictionary to hold 'subData'
subStats = {}
# tracks no. of submissions
subCount = 0
#Subreddit to query
sub = 'politics'
# Unix timestamps bounding the crawl window.
# Both are built from naive datetimes, so .timestamp() interprets them in the
# machine's local time zone.
before = int(datetime.datetime(2021,5,17,0,0).timestamp())
after = int(datetime.datetime(2014,1,1,0,0).timestamp())
# First page of results.
data = get_pushshift_data(after, before, sub)
# Keep paging until a request comes back with no submissions.
while len(data) > 0:
for submission in data:
collect_subData(submission)
subCount+=1
# Calls get_pushshift_data() with the created date of the last submission
print(len(data))
print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
# Move the lower bound to the newest submission seen so far; the next request
# is issued immediately, with no pause between calls.
after = data[-1]['created_utc']
data = get_pushshift_data(after, before, sub)
print(len(data))
# Write everything collected to the CSV file named by the user.
update_subFile()
At line 1 I call the get_pushshift_data(after, before, sub) function to scrape the data, and there is no error. But when I do the same thing again at line 11, with a different value for the after variable (type: int), the program fails with JSONDecodeError: Expecting value: line 1 column 1 (char 0).
This is the image for you to refer to which I have just described above
This is the Error Image

Split data into multiple worksheets

Since I am going to create a number of dataframes that I know won't fit inside a single Google worksheet (because of the column limit), I want to split the data across multiple worksheets. I'm using set_with_dataframe() and specifying which worksheet each dataframe is imported into, so my first thought was to create and define several worksheets and then use the same method. The problem is that I don't know how to "split" the data when there are no more columns left in the first worksheet (and then the second, the third, and so on...).
I'm quite new at working with Python and I have been stuck with this for days so any kind of help would be appreciated.
My code looks like this:
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession
from bs4 import BeautifulSoup
import pandas as pd
import requests
import traceback
import os
class DataScraper():
    """Scrape the first HTML table of every URL listed in a worksheet.

    The URL worksheet is read once at construction time.  Each of its rows
    becomes one column block in the data worksheet; the tables scraped from
    the URLs of that row are stacked vertically inside the block.
    """

    def __init__(self, sheets):
        """Remember the target worksheet and collect the URLs to scrape.

        ``sheets`` must expose ``data_worksheet`` and ``url_worksheet``.
        """
        self.data_worksheet = sheets.data_worksheet
        self.total_urls = 0
        self.urls = self.getAllUrls(sheets.url_worksheet)

    def getAllUrls(self, urlWorkSheet):
        """Return the URLs of the worksheet grouped by row.

        Only the first 14 cells of each row are inspected and only cells
        containing ``"https://"`` are kept.  Rows without any URL are left
        out.  ``self.total_urls`` is increased by the number of URLs found.
        """
        finalUrls = []
        for row in urlWorkSheet.get_all_values():
            row_urls = [cell for cell in row[:14] if "https://" in cell]
            if row_urls:
                self.total_urls += len(row_urls)
                finalUrls.append(row_urls)
        return finalUrls

    def _table_frame(self, url):
        """Download ``url`` and return its first table as a Labels/Data frame."""
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        records = []
        for tr in soup.find('table').findAll('tr'):
            headers = [th.text.strip() for th in tr.findAll('th')]
            data = [td.text.strip() for td in tr.findAll('td')]
            records.append({'Labels': headers, 'Data': data})
        df = pd.DataFrame(records)
        # Keep only the first header / first cell of every table row.
        df['Labels'] = df['Labels'].str[0]
        df['Data'] = df['Data'].str[0]
        df.drop(df[df['Labels'] == 'Links'].index, inplace=True)
        return df

    def StartScrape(self):
        """Scrape every collected URL and write the results to the data sheet.

        A failure on one page is reported (with its traceback) and does not
        stop the run; the status of every page is printed.
        """
        current_column_count = 1
        current_element = 0
        for urlRow in self.urls:
            current_row_count = 1
            last_data_frame_max_width = 0
            for url in urlRow:
                current_element += 1
                error = False
                try:
                    # The download happens once, inside the try, so network
                    # errors are handled like parsing errors.
                    df = self._table_frame(url)
                    set_with_dataframe(self.data_worksheet, df, col=current_column_count, row=current_row_count, include_column_header=False)
                    # Leave two empty rows between stacked tables.
                    current_row_count += df.shape[0]+2
                    if df.shape[1] > last_data_frame_max_width:
                        last_data_frame_max_width = df.shape[1]
                except Exception:
                    error = True
                    # Show why the page failed instead of hiding the cause.
                    traceback.print_exc()
                finally:
                    print(f"Processed page {current_element}/{self.total_urls} with status: {'success' if not error else 'error'}")
            # Next URL row starts five columns right of the widest table.
            current_column_count += last_data_frame_max_width+5
class Sheets():
    """Open a Google spreadsheet and expose the two worksheets that are used.

    ``filename`` is the service-account credentials file, looked up in the
    directory of this script; ``key`` is the spreadsheet key.  After
    construction ``data_worksheet`` is the "Data" sheet and ``url_worksheet``
    is the "Felix Copy" sheet.
    """

    def __init__(self, filename, key):
        self.filename = filename
        self.key = key
        self.data_worksheet = None
        self.url_worksheet = None
        self.getSheets(self.getCredentials())

    def _credentials_path(self):
        """Return the absolute path of the credentials file next to this script."""
        # abspath() makes this work when __file__ is relative, which the
        # previous split/insert/join construction did not handle.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        return os.path.join(script_dir, self.filename)

    def getCredentials(self):
        """Load the service-account credentials and return them with scopes."""
        credentials = service_account.Credentials.from_service_account_file(self._credentials_path())
        return credentials.with_scopes( ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])

    def getSheets(self, scoped_credentials):
        """Open the spreadsheet and store both worksheets on the instance."""
        gc = gspread.Client(auth=scoped_credentials)
        gc.session = AuthorizedSession(scoped_credentials)
        spreadsheet = gc.open_by_key(self.key)
        # Sheet that receives the scraped data
        self.data_worksheet = spreadsheet.worksheet("Data")
        # Sheet that lists the URLs
        self.url_worksheet = spreadsheet.worksheet("Felix Copy")
# Runs at module level (no __main__ guard in this listing).
# Get sheets
# "key_id" is passed as the spreadsheet key (looks like a placeholder - confirm).
sheets = Sheets("credentials.json", "key_id")
# Start scraping
scraper = DataScraper(sheets)
scraper.StartScrape()

Separate the elements I extracted from a web page by columns and print them in csv with python

I am starting to learn Python, I have this code which works fine for web scraping, I already have the info that I want and I am sending that data to a CSV but it prints all the text in just one cell.
Can you help me to fix it so I can print each element in a different column?
This is my code:
from bs4 import BeautifulSoup
import csv
# NOTE(review): indentation was lost in this listing. If only the csv.writer
# line sits inside the `with`, csv_file is closed as soon as the block ends
# and later writer3.writerow() calls would hit a closed file - confirm.
with open('output_file_name', 'w', newline='') as csv_file:
writer3 = csv.writer(csv_file, delimiter=';')
# Opened without a `with`; no close() appears in this listing.
file4 = open('hola4.csv', 'w', newline='')
writer4 = csv.writer(file4)
class Table:
    """Read the monthly-summary table through a Selenium-style driver."""

    # Locator of the data rows.  Attribute tests use '@'; the '#' that
    # appeared in these expressions is not valid XPath.
    _ROWS_XPATH = "//div[@id = 'resumen_mensual']/table/tbody[@id = 'body_tmes' ]/tr[contains(@class, 'ini')]"

    def __init__(self, driver):
        self.driver = driver

    def get_column_info(self):
        """Return the header texts of the table with '%' removed."""
        column_info = []
        columns = self.driver.find_elements_by_xpath("/html/body/div[1]/main/div[2]/div[3]/div/div/div[5]/div[2]/table/thead/tr/th")
        for column in columns:
            column_info.append(str(column.text.replace("%","")))
        # NOTE(review): writer2 is not defined anywhere in this listing; it
        # must be provided at module level for this call to work.
        writer2.writerow([column_info])
        return column_info

    def get_results(self, index=None):
        """Return ``{row_number: {column_name: cell_text}}`` for the table.

        With a truthy ``index`` only that (1-based) row is queried and it is
        used as the key; otherwise rows are numbered from 1 in page order.
        """
        columns = self.get_column_info()
        suffix = "[{}]".format(index) if index else ""
        elements = self.driver.find_elements_by_xpath(self._ROWS_XPATH + suffix)
        data = {}
        # enumerate() replaces the list.index() lookups, which were quadratic
        # and picked the wrong cell for repeated column names.
        for position, element in enumerate(elements, start=1):
            current_index = index if index else position
            parsed_data = {}
            for column_number, column in enumerate(columns, start=1):
                cell_xpath = "{}[{}]/td[{}]".format(self._ROWS_XPATH, current_index, column_number)
                value = element.find_element_by_xpath(cell_xpath).text
                parsed_data.update({column: str(value)})
            data.update({current_index: parsed_data})
        return data

    def get_number_of_results(self):
        """Return how many data rows the table currently has."""
        return len(self.driver.find_elements_by_xpath(self._ROWS_XPATH))
# Script entry point.
# NOTE(review): `driver` is not defined anywhere in this listing.
if "__main__" == __name__:
table = Table(driver)
# Each call below wraps the whole return value in a one-element list, so the
# complete list / dict is written as a single CSV cell.
writer4.writerow([table.get_column_info()])
writer3.writerow([table.get_results()])
table = Table(driver)
print(table.get_column_info())
I have this as a result if I run it:
['DÍA', 'T. MEDIA', 'T. MÁX', 'T. MÍN', 'V. MEDIA VIENTO', 'RACHAS MÁX', 'PRESIÓN MEDIA', 'LLUVIA']
and in csv:
I don't think you need writer2.writerow([column_info]).
Set delimiters to \t (delimiter='\t').
Instead of:
writer4.writerow([table.get_column_info()])
writer3.writerow([table.get_results()])
do:
for info in table.get_column_info():
writer4.writerow(info)
for result in table.get_results():
writer3.writerow(result)

Extract data from web page

I have a script to extract data from here: http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/
Part of obtaining the data in the script looks like this:
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])
I see that the names like fg and pts correspond to the table headlines, but I don't understand why certain ones are abbreviated in the script.
I want to modify the script to obtain the headlines on this table: http://espn.go.com/nba/statistics/player/_/stat/rebounds. I tried doing this by just plugging in the names as they appear at the top of the table but the resulting CSV file had missing information.
Full code :
import os
import csv
import time
import urllib2
uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes'
def get_data():
    """Download the page at the module-level ``uri``.

    Returns the raw response body.  If anything goes wrong the error is
    printed and ``False`` is returned instead.
    """
    try:
        response = urllib2.urlopen(urllib2.Request(uri), timeout=600)
        return response.read()
    except Exception as e:
        print("\n[!] Error: " + str(e))
        print('')
        return False
def extract(data,rk):
    """Append one CSV row per player found in ``data`` to data.csv.

    ``data`` is the raw HTML of one statistics page.  ``rk`` is the rank
    written for the first player found and grows by one per row.  Each row
    holds the rank followed by eleven values cut out of the markup that
    follows the player's ``nba/player/`` link.
    """
    # (marker, skip) pairs in page order: the value starts ``skip``
    # characters after the next ``marker`` and ends at the next '<'.
    # Column names are those of the header row written to data.csv.
    fields = (
        ('>', 1),     # PLAYER
        ('">', 2),    # TEAM
        (' >', 2),    # GP
        (' >', 2),    # MPG
        ('">', 2),    # PTS
        (' >', 2),    # FGM-FGA
        (' >', 2),    # FG%
        (' >', 2),    # 3PM-3PA
        (' >', 2),    # 3P%
        (' >', 2),    # FTM-FTA
        (' >', 2),    # FT%
    )
    print('\n[+] Extracting data.')
    start = 0
    # Opened once for the whole page; the with block closes it.
    with open("data.csv", "ab") as fcsv:
        writer = csv.writer(fcsv)
        while True:
            cursor = data.find('nba/player/', start)
            if cursor == -1:
                break
            store = [rk]
            for marker, skip in fields:
                begin = data.find(marker, cursor) + skip
                cursor = data.find('<', begin)
                store.append(data[begin:cursor])
                if len(store) == 2:
                    # Resume the next player search right after this name.
                    start = cursor
            rk = rk + 1
            writer.writerow(store)
def main():
    """Fetch the statistics pages one after another and extract each one.

    Creates data.csv with a header row when it does not exist yet, then
    loads pages of 40 players until the rank passes 300 or a download
    returns nothing.
    """
    global uri
    print("\n[+] Initializing...")
    if not os.path.exists("data.csv"):
        header = ["RK","PLAYER","TEAM","GP", "MPG","PTS","FGM-FGA","FG%","3PM-3PA","3P%","FTM-FTA","FT%"]
        with open("data.csv", "ab") as fcsv:
            csv.writer(fcsv).writerow(header)
    rk = 1
    while True:
        time.sleep(1)
        print("\n[+] Getting data, please wait.")
        data = get_data()
        if not data:
            break
        extract(data,rk)
        print("\n[+] Preparing for next page.")
        time.sleep(1.5)
        rk = rk + 40
        if rk > 300:
            print("\n[+] All Done !\n")
            break
        # Point the next download at the page that starts with rank rk.
        uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/sort/avg48Points/count/' + str(rk)
I specifically want to know how to grab info based on the headlines. Like TEAM GP MPG PTS FGM-FGA FG% 3PM-3PA 3P% FTM-FTA FT%
So the script doesn't need to be changed besides things like pts or mpg in pts_start = data.find('">',mpg_end) + 2
I don't understand why I can't just input the name of the headline in the table has shown for certain ones. Like instead of FTM-FTA, the script puts ft.
Extracting HTML data is rather easy with BeautifulSoup. The following example is meant to give you the idea; it is not a complete solution to your problem, but you can easily extend it.
from bs4 import BeautifulSoup
import urllib2
def get_html_page_dom(url):
    """Download ``url`` and return it parsed by BeautifulSoup with html5lib."""
    html_doc = urllib2.urlopen(url).read()
    dom = BeautifulSoup(html_doc, 'html5lib')
    return dom
def extract_rows(dom):
    """Yield one dict per table row selected by ``.mod-content tbody tr``.

    Rows whose ``class`` attribute contains ``colhead`` are skipped.  Each
    dict carries the keys ``RK``, ``PLAYER``, ``TEAM`` and ``GP`` taken from
    the first four cells; the player name is read from the first link in
    the second cell.
    """
    for row in dom.select('.mod-content tbody tr'):
        css_classes = row.get('class')
        is_header = css_classes is not None and 'colhead' in css_classes
        if is_header:
            continue
        cells = row.select('td')
        record = {}
        record['RK'] = cells[0].string
        record['PLAYER'] = cells[1].select('a')[0].string
        record['TEAM'] = cells[2].string
        record['GP'] = cells[3].string
        # Further columns can be added from the remaining cell indexes.
        yield record
# Demo entry point: fetch the scoring page and print every extracted row dict.
if __name__ == '__main__':
dom = get_html_page_dom('http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/')
for data in extract_rows(dom):
print(data)
You can simply run and see the result ;).

Returning a row that matches specified condition, and edit particular columns in row. Then write to csv file with changed row

I'm writing a Python script that works with two CSV files. Let's call them csv1.csv (the original file to read) and csv2.csv (an exact copy of csv1). The goal is to find the row and column in the CSV file that correspond to the user-defined input, and then modify them.
csv format:(continues for about 2-3 thousand lines)
record LNLIM, ID_CO,OD_DV,ID_LN, ST_LN, ZST_LN, ID_LNLIM,LIMIT1_LNLIM, LIMIT2_LNLIM, LIMIT3_LNLIM
LNLIM, 'FPL', 'SOUT', '137TH_LEVEE_B', 'B', '137TH_AV', 'LEVEE', 'A', 1000, 1100, 1200
LNLIM, 'FPL', 'SOUT', '137TH_DAVIS_B', 'A', '137TH_AV', 'NEWTON', 'A', 1000, 1100, 1200
...
Let's say that the user is looking for 137TH_AV and NEWTON. I want to be able to go row by row and compare the two columns/row indices ST_LN and ZST_LN. If both columns match what the user inputted then I want to capture which row in the csv file that happened on, and use that information to edit the remaining columns LIMIT1_LNLIM LIMIT2_LNLIM LIMIT3_LNLIM on that row with new analog values.
I want to get the 3 new values provided by the user and edit a specific row, and a specific row element. Once I've found the place to replace the number values I want to overwrite csv2.csv with this edit.
Determining where the line segment is located in the array
import sys
import csv
import os
import shutil
# Load the line-section name (column index 1) and the SCADA name (column
# index 29) of every row of the summary file into two parallel lists.
# Every row is read, including a header row if the file has one.
LineSectionNames = []
ScadaNames = []
with open('Vulcan_Imp_Summary.csv', 'r') as file:
reader = csv.reader(file)
for row in reader:
LineSectionName = row[1]
ScadaName = row[29]
LineSectionNames.append(LineSectionName)
ScadaNames.append(ScadaName)
#Reformatting arrays for accurate references
# '\xa0' (non-breaking space) becomes a plain space.
LineSectionNames = [character.replace('\xa0', ' ') for character in LineSectionNames]
# Presumably repairs a dash that was mis-encoded as '?' - confirm.
LineSectionNames = [character.replace('?', '-') for character in LineSectionNames]
ScadaNames = [character.replace('\xa0', ' ') for character in ScadaNames]
#Setting Line Section name as key and Scada name as value
# A repeated line-section name keeps the SCADA name of its last occurrence.
ScadaDict = {}
for i in range(len(LineSectionNames)):
ScadaDict[LineSectionNames[i]] = ScadaNames[i]
#Prompt user for grammatical name of Line Section
print ('Enter the Line Section Name: (Example = Goulds-Princeton) \n')
user_input = input()
#Reference user input to dictionary value to convert input into SCADA format
def reformat():
    """Look up the module-level ``user_input`` in ``ScadaDict``.

    Prints a search banner followed by the outcome.  Returns the SCADA name
    stored for the line section, or ``None`` when the name is unknown, so
    the lookup result is no longer thrown away.
    """
    print ('Searching for Line Section...' + user_input)
    if user_input not in ScadaDict:
        print ('The Line Section name you have entered was incorrect. Try again. \n Example = Goulds-Princeton')
        return None
    print ('\n\t Match!\n')
    return ScadaDict[user_input]
# The result of the lookup is not used here; the script continues even when
# the name was not found.
reformat()
# Copying the exported file from Genesys
path = 'I://PSCO//DBGROUP//PatrickL//'
shutil.copyfile(path + 'lnlim_import.csv', path + 'lnlim_import_c.csv')
#Using the SCADA format to search through csv file
print ('Searching csv file for...' + user_input)
# Reading the copied file
# One list per CSV column; all lists are filled in lockstep, so index i in
# every list belongs to the same input row.
record_lnlims = []
id_cos = []
id_dvs = []
id_lines = []
id_lns = []
st_lns = []
zst_lns = []
id_lnlims = []
limit1_lnlims = []
limit2_lnlims = []
limit3_lnlims = []
# NOTE(review): the copy was written under `path`, but this open() uses a
# bare file name and therefore resolves against the current directory.
with open('lnlim_import_c.csv', 'r') as copy:
# NOTE(review): csv.reader keeps the blank that follows each comma by
# default; if the real file looks like the sample rows, every field after
# the first starts with a space and keeps its single quotes.
reader = csv.reader(copy)
for row in reader:
record_lnlim = row[0]
id_co = row[1]
id_dv = row[2]
id_line = row[3]
id_ln = row[4]
st_ln = row[5]
zst_ln = row[6]
id_lnlim = row[7]
limit1_lnlim = row[8]
limit2_lnlim = row[9]
limit3_lnlim = row[10]
record_lnlims.append(record_lnlim)
id_cos.append(id_co)
id_dvs.append(id_dv)
id_lines.append(id_line)
id_lns.append(id_ln)
st_lns.append(st_ln)
zst_lns.append(zst_ln)
id_lnlims.append(id_lnlim)
limit1_lnlims.append(limit1_lnlim)
limit2_lnlims.append(limit2_lnlim)
limit3_lnlims.append(limit3_lnlim)
#Reformatting the user input from GOULDS-PRINCETON to 'GOULDS' and 'PRINCETON'
# Split at the first '-' only; input without a '-' makes input_split[1]
# raise IndexError.
input_split = user_input.split('-', 1)
st_ln1 = input_split[0]
zst_ln1 = input_split[1]
st_ln2 = st_ln1.upper()
zst_ln2 = zst_ln1.upper()
# Wrap in single quotes to mirror the quoted values in the file.
st_ln3 = "'" + str(st_ln2) + "'"
zst_ln3 = "'" + str(zst_ln2) + "'"
#Receiving analog values from user
# Printed unconditionally; no search of the file has happened at this point.
print ('\n\t Found! \n')
print ('Enter the Specified Emergency Rating (A) for 110% for 7 minutes: ')
limit1_input = input()
print ('Enter the Specified Emergency Rating (A) for 120% for 7 minutes: ')
limit2_input = input()
print ('Enter the Specified Emergency Rating (A) for 130% for 5 minutes: ')
limit3_input = input()
Whenever I print the row_index it prints the initialized value of 0.
# Locate the row whose two station columns equal the quoted user input, then
# write the columns back to the copied file.
i = 0
row_index = 0
# If no row matches, row_index keeps its initial value 0. A match requires
# exact string equality, so any leading blank left in the parsed fields
# prevents it.
for i in range(len(st_lns)):
if st_ln3 == st_lns[i] and zst_ln3 == zst_lns[i]:
row_index = i
print(row_index)
# NOTE(review): these assignments replace the values typed in by the user
# with the values already in the file; the lists themselves are not changed.
# The direction looks reversed relative to the stated goal - confirm.
limit1_input = limit1_lnlims[row_index]
limit2_input = limit2_lnlims[row_index]
limit3_input = limit3_lnlims[row_index]
# NOTE(review): ten column lists are collected here (id_lns is not among
# them), while the format string below has eleven placeholders.
csv_list = []
csv_list.append(record_lnlims)
csv_list.append(id_cos)
csv_list.append(id_dvs)
csv_list.append(id_lines)
csv_list.append(st_lns)
csv_list.append(zst_lns)
csv_list.append(id_lnlims)
csv_list.append(limit1_lnlims)
csv_list.append(limit2_lnlims)
csv_list.append(limit3_lnlims)
#Editing the csv file copy to implement new analog values
with open('lnlim_import_c.csv', 'w') as edit:
# NOTE(review): zip(csv_list) yields one 1-tuple per column list rather than
# one tuple per row, and format(x) passes that tuple as a single argument, so
# placeholders {1}..{10} have nothing to refer to. Output is tab-separated.
for x in zip(csv_list):
edit.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\n".format(x))

Categories