I have been using the code below to pull MLB lineups from BaseballPress.com. However, this pulls the official MLB lineups, which don't normally get posted until about an hour before the game.
import requests
import pandas as pd
import openpyxl
from bs4 import BeautifulSoup

url = "https://www.baseballpress.com/lineups/2022-08-09"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

def get_name(tag):
    if tag.select_one(".desktop-name"):
        return tag.select_one(".desktop-name").get_text()
    elif tag.select_one(".mobile-name"):
        return tag.select_one(".mobile-name").get_text()
    else:
        return tag.get_text()

data = []
for card in soup.select(".lineup-card"):
    header = [
        c.get_text(strip=True, separator=" ")
        for c in card.select(".lineup-card-header .c")
    ]
    h_p1, h_p2 = [
        get_name(p) for p in card.select(".lineup-card-header .player")
    ]
    data.append([*header, h_p1, h_p2])
    for p1, p2 in zip(
        card.select(".col--min:nth-of-type(1) .player"),
        card.select(".col--min:nth-of-type(2) .player"),
    ):
        p1 = get_name(p1).split(maxsplit=1)[-1]
        p2 = get_name(p2).split(maxsplit=1)[-1]
        data.append([*header, p1, p2])

df = pd.DataFrame(
    data, columns=["Team1", "Date", "Team2", "Player1", "Player2"]
)
df.to_excel("MLB Games.xlsx", sheet_name="sheet1", index=False)
print(df.head(10).to_markdown(index=False))
To get around this, I found out that Rotowire releases projected lineups about 24 hours in advance, which is what I need for this analysis. I have changed the Python script to match that website, except I am not sure how to adapt the get_name() helper. Does anyone know how I would address this portion of the code? See the new code below:
import requests
import pandas as pd
import openpyxl
from bs4 import BeautifulSoup

url = "https://www.rotowire.com/baseball/daily-lineups.php"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

def get_name(tag):
    if tag.select_one(".desktop-name"):
        return tag.select_one(".desktop-name").get_text()
    elif tag.select_one(".mobile-name"):
        return tag.select_one(".mobile-name").get_text()
    else:
        return tag.get_text()

data = []
for card in soup.select(".lineup__main"):
    header = [
        c.get_text(strip=True, separator=" ")
        for c in card.select(".lineup__teams .c")
    ]
    h_p1, h_p2 = [
        get_name(p) for p in card.select(".lineup__teams .lineup__player")
    ]
    data.append([*header, h_p1, h_p2])
    for p1, p2 in zip(
        card.select(".lineup__list is-visit:nth-of-type(1) .lineup__player"),
        card.select(".lineup__list is-home:nth-of-type(2) .lineup__player"),
    ):
        p1 = get_name(p1).split(maxsplit=1)[-1]
        p2 = get_name(p2).split(maxsplit=1)[-1]
        data.append([*header, p1, p2])

df = pd.DataFrame(
    data, columns=["Team1", "Date", "Team2", "Player1", "Player2"]
)
df.to_excel("MLB Predicted Lineups.xlsx", sheet_name="sheet1", index=False)
print(df.head(10).to_markdown(index=False))
You need to look at the actual HTML to see what tags and attributes the source is using in order to correctly identify the content you want. I made a script a while back that does what you are asking here, so I'm just posting that.
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

def get_players(home_away_dict):
    rows = []
    for home_away, v in home_away_dict.items():
        players = v['players']
        print("\n{} - {}".format(v['team'], v['lineupStatus']))
        for idx, player in enumerate(players):
            if home_away == 'Home':
                team = home_away_dict['Home']['team']
                opp = home_away_dict['Away']['team']
            else:
                team = home_away_dict['Away']['team']
                opp = home_away_dict['Home']['team']
            if player.find('span', {'class': 'lineup__throws'}):
                playerPosition = 'P'
                handedness = player.find('span', {'class': 'lineup__throws'}).text
            else:
                playerPosition = player.find('div', {'class': 'lineup__pos'}).text
                handedness = player.find('span', {'class': 'lineup__bats'}).text
            if 'title' in list(player.find('a').attrs.keys()):
                playerName = player.find('a')['title'].strip()
            else:
                playerName = player.find('a').text.strip()
            playerRow = {
                'Bat Order': idx,
                'Name': playerName,
                'Position': playerPosition,
                'Team': team,
                'Opponent': opp,
                'Home/Away': home_away,
                'Handedness': handedness,
                'Lineup Status': home_away_dict[home_away]['lineupStatus']}
            rows.append(playerRow)
            print('{} {}'.format(playerRow['Position'], playerRow['Name']))
    return rows

rows = []
url = 'https://www.rotowire.com/baseball/daily-lineups.php'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
lineupBoxes = soup.find_all('div', {'class': 'lineup__box'})
for lineupBox in lineupBoxes:
    try:
        awayTeam = lineupBox.find('div', {'class': 'lineup__team is-visit'}).text.strip()
        homeTeam = lineupBox.find('div', {'class': 'lineup__team is-home'}).text.strip()
        print(f'\n\n############\n {awayTeam} # {homeTeam}\n############')
        awayLineup = lineupBox.find('ul', {'class': 'lineup__list is-visit'})
        homeLineup = lineupBox.find('ul', {'class': 'lineup__list is-home'})
        awayLineupStatus = awayLineup.find('li', {'class': re.compile('lineup__status.*')}).text.strip()
        homeLineupStatus = homeLineup.find('li', {'class': re.compile('lineup__status.*')}).text.strip()
        awayPlayers = awayLineup.find_all('li', {'class': re.compile('lineup__player.*')})
        homePlayers = homeLineup.find_all('li', {'class': re.compile('lineup__player.*')})
        home_away_dict = {
            'Home': {
                'team': homeTeam, 'players': homePlayers, 'lineupStatus': homeLineupStatus},
            'Away': {
                'team': awayTeam, 'players': awayPlayers, 'lineupStatus': awayLineupStatus}}
        playerRows = get_players(home_away_dict)
        rows += playerRows
    except:  # skip boxes that don't have a full lineup posted yet
        continue

df = pd.DataFrame(rows)
print(df.head(20).to_markdown(index=False))
Output (first 20 of 300 rows):
| Bat Order | Name | Position | Team | Opponent | Home/Away | Handedness | Lineup Status |
|------------:|:-----------------|:-----------|:-------|:-----------|:------------|:-------------|:----------------|
| 0 | Nick Lodolo | P | CIN | PHI | Home | L | Expected Lineup |
| 1 | Jonathan India | 2B | CIN | PHI | Home | R | Expected Lineup |
| 2 | Nick Senzel | CF | CIN | PHI | Home | R | Expected Lineup |
| 3 | Kyle Farmer | 3B | CIN | PHI | Home | R | Expected Lineup |
| 4 | Joey Votto | 1B | CIN | PHI | Home | L | Expected Lineup |
| 5 | Aristides Aquino | DH | CIN | PHI | Home | R | Expected Lineup |
| 6 | Albert Almora | LF | CIN | PHI | Home | R | Expected Lineup |
| 7 | Matt Reynolds | RF | CIN | PHI | Home | R | Expected Lineup |
| 8 | Jose Barrero | SS | CIN | PHI | Home | R | Expected Lineup |
| 9 | Austin Romine | C | CIN | PHI | Home | R | Expected Lineup |
| 0 | Ranger Suarez | P | PHI | CIN | Away | L | Expected Lineup |
| 1 | Jean Segura | 2B | PHI | CIN | Away | R | Expected Lineup |
| 2 | Kyle Schwarber | LF | PHI | CIN | Away | L | Expected Lineup |
| 3 | Rhys Hoskins | 1B | PHI | CIN | Away | R | Expected Lineup |
| 4 | J.T. Realmuto | C | PHI | CIN | Away | R | Expected Lineup |
| 5 | Nick Castellanos | RF | PHI | CIN | Away | R | Expected Lineup |
| 6 | Alec Bohm | 3B | PHI | CIN | Away | R | Expected Lineup |
| 7 | Darick Hall | DH | PHI | CIN | Away | L | Expected Lineup |
| 8 | Bryson Stott | SS | PHI | CIN | Away | L | Expected Lineup |
| 9 | Matt Vierling | CF | PHI | CIN | Away | R | Expected Lineup |
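Since the goal in the question was an Excel file, the same to_excel call from the original script works on this DataFrame too:

# write the projected lineups out to Excel, as in the original script
df.to_excel("MLB Predicted Lineups.xlsx", sheet_name="sheet1", index=False)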
I'm trying to scrape the data from the table in the specifications section of this webpage:
Lochinvar Water Heaters
I'm using Beautiful Soup 4. I've tried searching for it by class, for example class="Table__Cell-sc-1e0v68l-0 kdksLO", but bs4 can't find that class on the webpage. I listed all the classes it could find, and none of them are useful. Any help is appreciated.
Here's the code I used to try to get the classes:
import requests
from bs4 import BeautifulSoup

URL = "https://www.lochinvar.com/products/commercial-water-heaters/armor-condensing-water-heater"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")

results = soup.find_all("div", class_='Table__Wrapper-sc-1e0v68l-3 iFOFNW')

classes = [value
           for element in soup.find_all(class_=True)
           for value in element["class"]]
classes = sorted(classes)
for cls in classes:
    print(cls)
The page is populated with JavaScript, but fortunately in this case much of the data [including the specs table you want] is inside a script tag within the fetched HTML. The script has just one statement, so it's fairly easy to extract it as JSON:
import json
### copied from your q ####
import requests
from bs4 import BeautifulSoup
URL = "https://www.lochinvar.com/products/commercial-water-heaters/armor-condensing-water-heater"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")
###########################
wrInf = soup.find(lambda l: l.name == 'script' and '__routeInfo' in l.text)
wrInf = wrInf.text.replace('window.__routeInfo = ', '', 1) # remove variable name
wrInf = wrInf.strip()[:-1] # get rid of ; at end
wrInf = json.loads(wrInf) # convert to python dictionary
specsTables = wrInf['data']['product']['specifications'][0]['table'] # get table (tsv string)
specsTables = [tuple(row.split('\t')) for row in specsTables.split('\n')] # convert rows to tuples
To view it, you could use pandas:
import pandas
headers = specsTables[0]
st_df = pandas.DataFrame([dict(zip(headers, r)) for r in specsTables[1:]])
# or just
# st_df = pandas.DataFrame(specsTables[1:], columns=headers)
print(st_df.head())
or you could simply print it:
for i, r in enumerate(specsTables):
    print(" | ".join([f'{c:^18}' for c in r]))
    if i == 0:
        print()
output:
Model Number | Btu/Hr Input | Thermal Efficiency | GPH # 100ºF Rise | A | B | C | D | E | F | G | H | I | J | K | L | M | Gas Conn. | Water Conn. | Air Inlet | Vent Size | Ship. Wt.
AWH0400NPM | 399,000 | 99% | 479 | 45" | 24" | 30-1/2" | 42-1/2" | 29-3/4" | 20-1/4" | 12" | 20" | 38" | 3-1/2" | 10-1/2" | 19-1/4" | 20" | 1" | 2" | 4" | 4" | 326
AWH0500NPM | 500,000 | 99% | 600 | 45" | 24" | 30-1/2" | 42-1/2" | 29-3/4" | 20-1/4" | 12" | 20" | 38" | 3-1/2" | 10-1/2" | 19-1/4" | 20" | 1" | 2" | 4" | 4" | 333
AWH0650NPM | 650,000 | 98% | 772 | 45" | 24" | 41" | 53" | 30-1/2" | 15-1/4" | 12" | 20" | 38" | 3-1/2" | 10-1/2" | 19-1/4" | 20" | 1-1/4" | 2" | 4" | 6" | 424
AWH0800NPM | 800,000 | 98% | 950 | 45" | 24" | 41" | 53" | 30-1/2" | 15-1/4" | 12" | 20" | 38" | 3-1/2" | 10-1/2" | 19-1/4" | 20" | 1-1/4" | 2" | 4" | 6" | 434
AWH1000NPM | 999,000 | 98% | 1,187 | 45" | 24" | 48" | 62" | 30-1/2" | 15-3/4" | 12" | 20" | 38" | 3-1/2" | 10-1/2" | 19-1/4" | 20" | 1-1/4" | 2-1/2" | 6" | 6" | 494
AWH1250NPM | 1,250,000 | 98% | 1,485 | 51-1/2" | 34" | 49" | 59" | 5-1/2" | 5-1/2" | 13-1/2" | 6-3/4" | 46-3/4" | 5-3/4" | 19-3/4" | 23" | 22-1/2" | 1-1/2" | 2-1/2" | 8" | 8" | 1,568
AWH1500NPM | 1,500,000 | 98% | 1,782 | 51-1/2" | 34" | 52-3/4" | 62-3/4" | 4-1/2" | 4-1/2" | 13-1/2" | 6-3/4" | 46-3/4" | 5-3/4" | 19-3/4" | 23" | 22-1/2" | 1-1/2" | 2-1/2" | 8" | 8" | 1,649
AWH2000NPM | 1,999,000 | 98% | 2,375 | 51-1/2" | 34" | 65-1/2" | 75-1/2" | 7" | 5-3/4" | 14-3/4" | 7-1/4" | 46-3/4" | 6-3/4" | 18-3/4" | 23" | 23-1/2" | 1-1/2" | 2-1/2" | 8" | 8" | 1,911
AWH3000NPM | 3,000,000 | 98% | 3,564 | 67-1/4" | 48-1/4" | 79-3/4" | 93-3/4" | 4-3/4" | 6-3/4" | 17-3/4" | 8-3/4" | 60-1/4" | 8-1/2" | 25-1/2" | 29-1/2" | 40" | 2" | 4" | 10" | 10" | 3,147
AWH4000NPM | 4,000,000 | 98% | 4,752 | 67-1/4" | 48-1/4" | 96" | 110" | 5" | 7-1/2" | 17-3/4" | 8-3/4" | 60-1/4" | 8-1/2" | 25-1/2" | 29-1/2" | 40" | 2-1/2" | 4" | 12" | 12" | 3,694
If you wanted a specific model's specs:
modelNo = 'AWH1000NPM'
mSpecs = [r for r in specsTables if r[0] == modelNo]
mSpecs = [[]] if mSpecs == [] else mSpecs # in case there is no match
mSpecs = dict(zip(specsTables[0], mSpecs[0])) # convert to dictionary
print(mSpecs)
output:
{'Model Number': 'AWH1000NPM', 'Btu/Hr Input': '999,000', 'Thermal Efficiency': '98%', 'GPH # 100ºF Rise': '1,187', 'A': '45"', 'B': '24"', 'C': '48"', 'D': '62"', 'E': '30-1/2"', 'F': '15-3/4"', 'G': '12"', 'H': '20"', 'I': '38"', 'J': '3-1/2"', 'K': '10-1/2"', 'L': '19-1/4"', 'M': '20"', 'Gas Conn.': '1-1/4"', 'Water Conn.': '2-1/2"', 'Air Inlet': '6"', 'Vent Size': '6"', 'Ship. Wt.': '494'}
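If you already built st_df with pandas above, the same lookup is a one-liner on the DataFrame (a small sketch):

# equivalent lookup on the DataFrame built earlier
print(st_df.loc[st_df['Model Number'] == 'AWH1000NPM'].to_dict('records'))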
The contents for constructing the table are within a script tag. You can extract the relevant string and re-create the table through string manipulation.
import requests, re
import pandas as pd

r = requests.get('https://www.lochinvar.com/products/commercial-water-heaters/armor-condensing-water-heater/').text
s = re.sub(r'\\"', '"', re.search(r'table":"([\s\S]+?)(?:","tableFootNote)', r).group(1))
lines = [i.split('\\t') for i in s.split('\\n')]
df = pd.DataFrame(lines[1:], columns=lines[0])
df.head(5)
I am trying to scrape a table that in some cells has a "graphical" element (an up/down arrow), using R. Unfortunately, the rvest function html_table seems to skip these elements. This is how such a cell with an arrow looks in HTML:
<td>
  <span style="font-weight: bold; color: darkgreen">Ba2</span>
  <i class="glyphicon glyphicon-arrow-down" title="negative outlook"></i>
</td>
The code I am using is:
require(rvest)
require(tidyverse)
url = "https://tradingeconomics.com/country-list/rating"
#bypass company firewall
download.file(url, destfile = "scrapedpage.html", quiet=TRUE)
content <- read_html("scrapedpage.html")
tables <- content %>% html_table(fill = TRUE, trim=TRUE)
But for the cell above, for example, it gives me only the string Ba2. Is there a way to also include the arrows somehow (as text, e.g. Ba2 neg)? A solution in Python would also be useful, if R does not have such functionality.
Thank you!
I don't know if this is possible in R, but in Python this will give you the required results.
I have printed the first few rows to give you an idea of how the data looks.
(pos) denotes an up arrow and (neg) a down arrow.
from bs4 import BeautifulSoup
import requests

url = 'https://tradingeconomics.com/country-list/rating'
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'html.parser')

t = soup.find('table', attrs={'id': 'ctl00_ContentPlaceHolder1_ctl01_GridView1'})
tr = t.findAll('tr')
for i in range(1, 10):
    tds = tr[i].findAll('td')
    for j in tds:
        fa_down = j.find('i', class_='glyphicon-arrow-down')
        fa_up = j.find('i', class_='glyphicon-arrow-up')
        if fa_up:
            print(f'{j.text.strip()} (pos)')
        elif fa_down:
            print(f'{j.text.strip()} (neg)')
        else:
            print(f'{j.text.strip()}')
Output:
+------------+---------+-----------+-----------+---------+---------+
| Country    | S&P     | Moody's   | Fitch     | DBRS    | TE      |
+------------+---------+-----------+-----------+---------+---------+
| Albania | B+ | B1 | | | 35 |
| Andorra | BBB | | BBB+ | | 62 |
| Angola | CCC+ | Caa1 | CCC | | 21 |
| Argentina | CCC+ | Ca | CCC | CCC | 15 |
| Armenia | | Ba3 | B+ | | 16 |
| Aruba | BBB | | BB | | 52 |
| Australia | AAA | Aaa | AAA (neg) | AAA | 100 |
| Austria | AA+ | Aa1 | AA+ | AAA | 96 |
| Azerbaijan | BB+ | Ba2 (pos) | BB+ | | 48 |
+------------+---------+-----------+-----------+---------+---------+
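If you would rather collect the annotated cells into a DataFrame than print them, the same loop can build rows instead. A sketch reusing t and tr from the code above; the column names are my reading of the page, so check them against the live table:

import pandas as pd

records = []
for row in tr[1:]:
    cells = []
    for td in row.findAll('td'):
        text = td.text.strip()
        if td.find('i', class_='glyphicon-arrow-up'):
            text += ' (pos)'
        elif td.find('i', class_='glyphicon-arrow-down'):
            text += ' (neg)'
        cells.append(text)
    records.append(cells)

# assumes six cells per row, matching the table's six columns
ratings = pd.DataFrame(records, columns=['Country', 'S&P', "Moody's", 'Fitch', 'DBRS', 'TE'])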
I have to compare two different sources and identify all the mismatches for all IDs.
Source_excel table
+-----+-------------+------+----------+
| id | name | City | flag |
+-----+-------------+------+----------+
| 101 | Plate | NY | Ready |
| 102 | Back washer | NY | Sold |
| 103 | Ring | MC | Planning |
| 104 | Glass | NMC | Ready |
| 107 | Cover | PR | Ready |
+-----+-------------+------+----------+
Source_dw table
+-----+----------+------+----------+
| id | name | City | flag |
+-----+----------+------+----------+
| 101 | Plate | NY | Planning |
| 102 | Nut | TN | Expired |
| 103 | Ring | MC | Planning |
| 104 | Top Wire | NY | Ready |
| 105 | Bolt | MC | Expired |
+-----+----------+------+----------+
Expected result
+-----+-------------+----------+------------+----------+------------+---------+------------------+
| ID | excel_name | dw_name | excel_flag | dw_flag | excel_city | dw_city | RESULT |
+-----+-------------+----------+------------+----------+------------+---------+------------------+
| 101 | Plate | Plate | Ready | Planning | NY | NY | FLAG_MISMATCH |
| 102 | Back washer | Nut | Sold | Expired | NY | TN | NAME_MISMATCH |
| 102 | Back washer | Nut | Sold | Expired | NY | TN | FLAG_MISMATCH |
| 102 | Back washer | Nut | Sold | Expired | NY | TN | CITY_MISMATCH |
| 103 | Ring | Ring | Planning | Planning | MC | MC | ALL_MATCH |
| 104 | Glass | Top Wire | Ready | Ready | NMC | NY | NAME_MISMATCH |
| 104 | Glass | Top Wire | Ready | Ready | NMC | NY | CITY_MISMATCH |
| 107 | Cover | | Ready | | PR | | MISSING IN DW |
| 105 | | Bolt | | Expired | | MC | MISSING IN EXCEL |
+-----+-------------+----------+------------+----------+------------+---------+------------------+
I'm new to Python, and I have tried the code below, but it is not giving the expected result.
import pandas as pd

source_excel = pd.read_csv('C:/Mypython/Newyork/excel.csv', encoding="ISO-8859-1")
source_dw = pd.read_csv('C:/Mypython/Newyork/dw.csv', encoding="ISO-8859-1")

comparison_result = pd.merge(source_excel, source_dw, on='ID', how='outer', indicator=True)
comparison_result.loc[(comparison_result['_merge'] == 'both') & (comparison_result['name_x'] != comparison_result['name_y']), 'Result'] = 'NAME_MISMATCH'
comparison_result.loc[(comparison_result['_merge'] == 'both') & (comparison_result['city_x'] != comparison_result['city_y']), 'Result'] = 'CITY_MISMATCH'
comparison_result.loc[(comparison_result['_merge'] == 'both') & (comparison_result['flag_x'] != comparison_result['flag_y']), 'Result'] = 'FLAG_MISMATCH'
comparison_result.loc[comparison_result['_merge'] == 'left_only', 'Result'] = 'Missing in dw'
comparison_result.loc[comparison_result['_merge'] == 'right_only', 'Result'] = 'Missing in excel'
comparison_result.loc[comparison_result['_merge'] == 'both', 'Result'] = 'ALL_Match'
csv_column = comparison_result[['ID', 'name_x', 'name_y', 'city_x', 'city_y', 'flag_x', 'flag_y', 'Result']]
print(csv_column)
Is there any other way I can check all the conditions and report each one in a separate row? If separate rows are not possible, at least I need all mismatches in the same column, separated, something like FLAG_MISMATCH,CITY_MISMATCH.
You could do:
df = pd.merge(Source_excel, Source_dw, on = 'ID', how = 'left', suffixes = (None, '_dw'))
This will create a new dataframe like the one you want, although you'll have to reorder the columns. Note that '_dw' is a suffix, not a prefix, in this case.
You can reorder the columns as you like by using this code:
#Complement with the order you want
df = df[['ID', 'excel_name']]
For the result column, I think you'll have to create a column for each condition you're trying to check (at least that's the only way I know how). Here's an example:
#This will return 1 if there's a match and 0 otherwise
df['result_flag'] = df.apply(lambda x: 1 if x.excel_flag == x.flag_dw else 0, axis = 1)
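If you want one row per mismatch, as in the expected result, one approach is to loop over the checks and append a row for each failed one. A sketch building on the merged df above, with column names assuming the '_dw' suffix scheme; catching the MISSING IN DW / MISSING IN EXCEL cases would additionally need how='outer' and indicator=True in the merge:

checks = [('name', 'name_dw', 'NAME_MISMATCH'),
          ('flag', 'flag_dw', 'FLAG_MISMATCH'),
          ('City', 'City_dw', 'CITY_MISMATCH')]
out = []
for _, r in df.iterrows():
    # one output row per failed check, or a single ALL_MATCH row
    hits = [label for a, b, label in checks if r[a] != r[b]]
    for label in (hits or ['ALL_MATCH']):
        out.append({**r.to_dict(), 'RESULT': label})
result = pd.DataFrame(out)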
Here is a way to do the scoring:
df['result'] = 0

# repeated mask / df.loc statements suggest a loop over a list of tuples
mask = df['excel_flag'] != df['dw_flag']
df.loc[mask, 'result'] += 1

mask = df['excel_name'] != df['dw_name']
df.loc[mask, 'result'] += 10

df['result'] = df['result'].map({0: 'all match',
                                 1: 'flag mismatch',
                                 10: 'name mismatch',
                                 11: 'all mismatch'})
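The loop hinted at in the comment, extended to the city column (a sketch; with three checks, the map would need an entry for each score combination, e.g. 101 for flag plus city):

df['result'] = 0
# (excel column, dw column, score) tuples, one per check
for a, b, score in [('excel_flag', 'dw_flag', 1),
                    ('excel_name', 'dw_name', 10),
                    ('excel_city', 'dw_city', 100)]:
    df.loc[df[a] != df[b], 'result'] += score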
I am trying to aggregate data in a PySpark dataframe on a particular criterion. I am trying to align accounts by switchOUT and switchIN amounts, so that an account money switches out of becomes the from-account and the receiving accounts become to-accounts.
The data I am getting in the dataframe to begin with:
+--------+------+-----------+----------+----------+-----------+
| person | acct | close_amt | open_amt | switchIN | switchOUT |
+--------+------+-----------+----------+----------+-----------+
| A | 1 | 125 | 50 | 75 | 0 |
+--------+------+-----------+----------+----------+-----------+
| A | 2 | 100 | 75 | 25 | 0 |
+--------+------+-----------+----------+----------+-----------+
| A | 3 | 200 | 300 | 0 | 100 |
+--------+------+-----------+----------+----------+-----------+
And the table I want to get to:
+--------+-----------+---------+----------+-----------+
| person | from_acct | to_acct | switchIN | switchOUT |
+--------+-----------+---------+----------+-----------+
| A      | 3         | 1       | 75       | 100       |
+--------+-----------+---------+----------+-----------+
| A      | 3         | 2       | 25       | 100       |
+--------+-----------+---------+----------+-----------+
Also, how can I do it so that it works for N rows (not just 3 accounts)?
So far I have used this code:
import operator
from pyspark.sql import functions as F

# define udfs
def sorter(l):
    res = sorted(l, key=operator.itemgetter(1))
    return [item[0] for item in res]

def list_to_string(l):
    res = 'from_fund_' + str(l[0]) + '_to_fund_' + str(l[1])
    return res

def listfirstAcc(l):
    res = str(l[0])
    return res

def listSecAcc(l):
    res = str(l[1])
    return res

sort_udf = F.udf(sorter)
list_str = F.udf(list_to_string)
extractFirstFund = F.udf(listfirstAcc)
extractSecondFund = F.udf(listSecAcc)

# Add additional columns
df = df.withColumn("move", sort_udf("list_col").alias("sorted_list"))
df = df.withColumn("move_string", list_str("move"))
df = df.withColumn("From_Acct", extractFirstFund("move"))
df = df.withColumn("To_Acct", extractSecondFund("move"))
Current outcome I am getting:
+--------+-----------+---------+----------+-----------+
| person | from_acct | to_acct | switchIN | switchOUT |
+--------+-----------+---------+----------+-----------+
| A      | 3         | 1,2     | 75       | 100       |
+--------+-----------+---------+----------+-----------+
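Not an exact fix for the UDF approach, but one way to get the wanted table without UDFs is to split the frame into switch-out and switch-in rows and join them back per person. A hedged sketch: it assumes from- and to-accounts are identified purely by nonzero switchOUT/switchIN, as in the sample data, and it works for any number of accounts:

from pyspark.sql import functions as F

# accounts money switches out of become from-accounts
outs = df.filter(F.col("switchOUT") > 0).select(
    "person", F.col("acct").alias("from_acct"), "switchOUT")
# accounts money switches into become to-accounts
ins = df.filter(F.col("switchIN") > 0).select(
    "person", F.col("acct").alias("to_acct"), "switchIN")

# one row per (from, to) pair, as in the wanted table
result = outs.join(ins, on="person").select(
    "person", "from_acct", "to_acct", "switchIN", "switchOUT")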
So I have the following class, which prints the text and header you call it with. I have also provided the code I am using to call the Print function. I have a string, outputstring, which contains the text I want to print. My expected output and my actual output are below. Printing seems to remove the spaces that are needed for proper legibility. How can I print while keeping the spaces?
Class:
# Printer class
class Printer(HtmlEasyPrinting):
    def __init__(self):
        HtmlEasyPrinting.__init__(self)

    def GetHtmlText(self, text):
        "Simple conversion of text. Use a more powerful version"
        html_text = text.replace('\n\n', '<P>')
        html_text = text.replace('\n', '<BR>')
        return html_text

    def Print(self, text, doc_name):
        self.SetHeader(doc_name)
        self.PrintText(self.GetHtmlText(text), doc_name)

    def PreviewText(self, text, doc_name):
        self.SetHeader(doc_name)
        HtmlEasyPrinting.PreviewText(self, self.GetHtmlText(text))
Expected Print:
+-------------------+---------------------------------+------+-----------------+-----------+
| Domain: | Mail Server: | TLS: | # of Employees: | Verified: |
+-------------------+---------------------------------+------+-----------------+-----------+
| bankofamerica.com | ltwemail.bankofamerica.com | Y | 239000 | Y |
| | rdnemail.bankofamerica.com | Y | | Y |
| | kcmemail.bankofamerica.com | Y | | Y |
| | rchemail.bankofamerica.com | Y | | Y |
| citigroup.com | mx-b.mail.citi.com | Y | 248000 | N |
| | mx-a.mail.citi.com | Y | | N |
| bnymellon.com | cluster9bny.us.messagelabs.com | ? | 51400 | N |
| | cluster9bnya.us.messagelabs.com | Y | | N |
| usbank.com | mail1.usbank.com | Y | 65565 | Y |
| | mail2.usbank.com | Y | | Y |
| | mail3.usbank.com | Y | | Y |
| | mail4.usbank.com | Y | | Y |
| us.hsbc.com | vhiron1.us.hsbc.com | Y | 255200 | Y |
| | vhiron2.us.hsbc.com | Y | | Y |
| | njiron1.us.hsbc.com | Y | | Y |
| | njiron2.us.hsbc.com | Y | | Y |
| | nyiron1.us.hsbc.com | Y | | Y |
| | nyiron2.us.hsbc.com | Y | | Y |
| pnc.com | cluster5a.us.messagelabs.com | Y | 49921 | N |
| | cluster5.us.messagelabs.com | ? | | N |
| tdbank.com | cluster5.us.messagelabs.com | ? | 0 | N |
| | cluster5a.us.messagelabs.com | Y | | N |
+-------------------+---------------------------------+------+-----------------+-----------+
Actual Print:
The same as expected, but the spaces are removed, making it very hard to read.
Function call:
def printFile():
    outputstring = txt_tableout.get(1.0, 'end')
    print(outputstring)
    app = wx.PySimpleApp()
    p = Printer()
    p.Print(outputstring, "Data Results")
For anyone else struggling, this is the modified class function I used to generate a nice table with all rows and columns.
def GetHtmlText(self, text):
    html_text = '<h3>Data Results:</h3><p><table border="2">'
    html_text += "<tr><td>Domain:</td><td>Mail Server:</td><td>TLS:</td><td># of Employees:</td><td>Verified</td></tr>"
    for row in root.ptglobal.to_csv():
        html_text += "<tr>"
        for x in range(len(row)):
            html_text += "<td>" + str(row[x]) + "</td>"
        html_text += "</tr>"
    return html_text + "</table></p>"
maybe try
`html_text = text.replace(' ', '&nbsp;').replace('\n', '<br/>')`
that would replace your spaces with HTML space characters ... but it would still not look right since it is not a monospace font ... this will be hard to automate ... you really want to put it in a table structure ... but that would require some work
you probably want to invest a little more time in your html conversion ... perhaps something like (making assumptions based on what you have shown)
def GetHtmlText(self, text):
    "Simple conversion of text. Use a more powerful version"
    text_lines = text.splitlines()
    html_text = "<table>"
    html_text += "<tr><th>" + "</th><th>".join(text_lines[0].split(":")) + "</th></tr>"
    for line in text_lines[1:]:
        html_text += "<tr><td>" + "</td><td>".join(line.split()) + "</td></tr>"
    return html_text + "</table>"
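Since the text in the question is already space-aligned, one more low-effort option is to wrap it in a pre tag, which preserves whitespace and renders in a monospace font. A minimal sketch, assuming HtmlEasyPrinting handles this standard tag:

def GetHtmlText(self, text):
    # escape HTML-significant characters, then let <pre> keep the
    # spacing and monospace alignment of the ASCII table
    escaped = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    return '<pre>' + escaped + '</pre>'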