from bs4 import BeautifulSoup
import numpy as np
import requests
from selenium import webdriver
from nltk.tokenize import sent_tokenize, word_tokenize

html = webdriver.Firefox(executable_path=r'D:\geckodriver.exe')
html.get("https://www.tsa.gov/coronavirus/passenger-throughput")

def TSA_travel_numbers(html):
    soup = BeautifulSoup(html, 'lxml')
    for i, rows in enumerate(soup.find('div', class_='view-content'), 1):
        # print(rows.content)
        for header in rows.find('tr'):
            number = rows.find_all('td', class_='views-field views-field field-2021-throughput views-align-center')
            print(number.text)

TSA_travel_numbers(html.page_source)
My error is as follows:
Traceback (most recent call last):
File "TSA_travel.py", line 23, in <module>
TSA_travel_numbers(html.page_source)
File "TSA_travel.py", line 15, in TSA_travel_numbers
for header in rows.find('tr'):
TypeError: 'int' object is not iterable
What is happening here?
I can't iterate through the 'tr' tags; please help me solve this problem.
Thanks in advance for your time!
As the error says, you can't iterate over an int. Looping over the div tag yields its children, some of which are plain strings, and calling .find('tr') on a string is str.find(), which returns an int rather than a tag.
Also, there's no need for a webdriver as data on the page is static.
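You can confirm that with a plain GET; a quick sketch (the class name comes from your own selector):

import requests

# Sketch: if the table cells appear in the raw HTML, no browser is needed.
page = requests.get("https://www.tsa.gov/coronavirus/passenger-throughput").text
print("views-align-center" in page)  # expected: True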
Here's my take on it:
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate

def get_page(url):
    return requests.get(url).text

def get_data(page):
    soup = BeautifulSoup(page, 'lxml')
    # grab the text of every cell matched by the .views-align-center class
    return [
        item.getText(strip=True) for item in soup.select(".views-align-center")
    ]

def build_table(table_rows):
    # group the flat list of cells into rows of 4; the first row is the header
    t = [table_rows[i:i + 4] for i in range(0, len(table_rows[1:]), 4)]
    h = t[0]
    return t[1:], h

if __name__ == '__main__':
    source = "https://www.tsa.gov/coronavirus/passenger-throughput"
    table, header = build_table(get_data(get_page(source)))
    print(tabulate(table, headers=header, tablefmt="pretty"))
Output:
+------------+--------------------------+--------------------------+--------------------------+
| Date | 2021 Traveler Throughput | 2020 Traveler Throughput | 2019 Traveler Throughput |
+------------+--------------------------+--------------------------+--------------------------+
| 5/9/2021 | 1,707,805 | 200,815 | 2,419,114 |
| 5/8/2021 | 1,429,657 | 169,580 | 1,985,942 |
| 5/7/2021 | 1,703,267 | 215,444 | 2,602,631 |
| 5/6/2021 | 1,644,050 | 190,863 | 2,555,342 |
| 5/5/2021 | 1,268,938 | 140,409 | 2,270,662 |
| 5/4/2021 | 1,134,103 | 130,601 | 2,106,597 |
| 5/3/2021 | 1,463,672 | 163,692 | 2,470,969 |
| 5/2/2021 | 1,626,962 | 170,254 | 2,512,598 |
| 5/1/2021 | 1,335,535 | 134,261 | 1,968,278 |
| 4/30/2021 | 1,558,553 | 171,563 | 2,546,029 |
| 4/29/2021 | 1,526,681 | 154,695 | 2,499,461 |
| 4/28/2021 | 1,184,326 | 119,629 | 2,256,442 |
| 4/27/2021 | 1,077,199 | 110,913 | 2,102,068 |
| 4/26/2021 | 1,369,410 | 119,854 | 2,412,770 |
| 4/25/2021 | 1,571,220 | 128,875 | 2,506,809 |
| 4/24/2021 | 1,259,724 | 114,459 | 1,990,464 |
| 4/23/2021 | 1,521,393 | 123,464 | 2,521,897 |
| 4/22/2021 | 1,509,649 | 111,627 | 2,526,961 |
| 4/21/2021 | 1,164,099 | 98,968 | 2,254,209 |
| 4/20/2021 | 1,082,443 | 92,859 | 2,227,475 |
| 4/19/2021 | 1,412,500 | 99,344 | 2,594,171 |
| 4/18/2021 | 1,572,383 | 105,382 | 2,356,802 |
| 4/17/2021 | 1,277,815 | 97,236 | 1,988,205 |
| 4/16/2021 | 1,468,218 | 106,385 | 2,457,133 |
| 4/15/2021 | 1,491,435 | 95,085 | 2,616,158 |
| 4/14/2021 | 1,152,703 | 90,784 | 2,317,381 |
| 4/13/2021 | 1,085,034 | 87,534 | 2,208,688 |
| 4/12/2021 | 1,468,972 | 102,184 | 2,484,580 |
| 4/11/2021 | 1,561,495 | 90,510 | 2,446,801 |
and so on ...
Or, an even shorter approach: just use pandas:
import pandas as pd
import requests
from tabulate import tabulate

if __name__ == '__main__':
    source = "https://www.tsa.gov/coronavirus/passenger-throughput"
    df = pd.read_html(requests.get(source).text, flavor="bs4")[0]
    print(tabulate(df.head(10), tablefmt="pretty", showindex=False))
Output:
+-----------+-----------+--------+---------+
| 5/9/2021 | 1707805.0 | 200815 | 2419114 |
| 5/8/2021 | 1429657.0 | 169580 | 1985942 |
| 5/7/2021 | 1703267.0 | 215444 | 2602631 |
| 5/6/2021 | 1644050.0 | 190863 | 2555342 |
| 5/5/2021 | 1268938.0 | 140409 | 2270662 |
| 5/4/2021 | 1134103.0 | 130601 | 2106597 |
| 5/3/2021 | 1463672.0 | 163692 | 2470969 |
| 5/2/2021 | 1626962.0 | 170254 | 2512598 |
| 5/1/2021 | 1335535.0 | 134261 | 1968278 |
| 4/30/2021 | 1558553.0 | 171563 | 2546029 |
+-----------+-----------+--------+---------+
I am trying to scrape the th element, but the call keeps returning None. What am I doing wrong?
This is the code I have tried:
import requests
import bs4
import urllib3

dateList = []
openList = []
closeList = []
highList = []
lowList = []

r = requests.get('https://coinmarketcap.com/currencies/bitcoin/historical-data/')
soup = bs4.BeautifulSoup(r.text, 'lxml')
td = soup.find('th')
print(td)
There's an API endpoint, so you can fetch the data from there.
Here's how:
import pandas as pd
import requests
from tabulate import tabulate
api_endpoint = "https://web-api.coinmarketcap.com/v1/cryptocurrency/ohlcv/historical?id=1&convert=USD&time_start=1609804800&time_end=1614902400"
bitcoin = requests.get(api_endpoint).json()
df = pd.DataFrame([q["quote"]["USD"] for q in bitcoin["data"]["quotes"]])
print(tabulate(df, headers="keys", showindex=False, disable_numparse=True, tablefmt="pretty"))
Output:
+----------------+----------------+----------------+----------------+--------------------+-------------------+--------------------------+
| open | high | low | close | volume | market_cap | timestamp |
+----------------+----------------+----------------+----------------+--------------------+-------------------+--------------------------+
| 34013.614533 | 36879.69856854 | 33514.03374162 | 36824.36441009 | 75289433810.59091 | 684671246323.6501 | 2021-01-06T23:59:59.999Z |
| 36833.87435728 | 40180.3679073 | 36491.18981083 | 39371.04235311 | 84762141031.49448 | 732062681138.1346 | 2021-01-07T23:59:59.999Z |
| 39381.76584266 | 41946.73935079 | 36838.63599637 | 40797.61071993 | 88107519479.50471 | 758625941266.7522 | 2021-01-08T23:59:59.999Z |
| 40788.64052286 | 41436.35000639 | 38980.87690625 | 40254.54649816 | 61984162837.0747 | 748563483043.1383 | 2021-01-09T23:59:59.999Z |
| 40254.21779758 | 41420.19103255 | 35984.62712175 | 38356.43950662 | 79980747690.35463 | 713304617760.9486 | 2021-01-10T23:59:59.999Z |
| 38346.52950301 | 38346.52950301 | 30549.59876946 | 35566.65594049 | 123320567398.62296 | 661457321418.0524 | 2021-01-11T23:59:59.999Z |
| 35516.36114084 | 36568.52697414 | 32697.97662163 | 33922.9605815 | 74773277909.4566 | 630920422745.0479 | 2021-01-12T23:59:59.999Z |
| 33915.11958124 | 37599.96059774 | 32584.66767186 | 37316.35939997 | 69364315979.27992 | 694069582193.7559 | 2021-01-13T23:59:59.999Z |
| 37325.10763475 | 39966.40524241 | 36868.5632453 | 39187.32812109 | 63615990033.01017 | 728904366964.3611 | 2021-01-14T23:59:59.999Z |
| 39156.7080858 | 39577.71118833 | 34659.58974449 | 36825.36585131 | 67760757880.723885 | 685005864471.3622 | 2021-01-15T23:59:59.999Z |
| 36821.64873201 | 37864.36887891 | 35633.55401669 | 36178.13890106 | 57706187875.104546 | 673000645230.8221 | 2021-01-16T23:59:59.999Z |
| 36163.64923243 | 36722.34987621 | 34069.32218533 | 35791.27792129 | 52359854336.21185 | 665831621390.9865 | 2021-01-17T23:59:59.999Z |
| 35792.23666766 | 37299.28580604 | 34883.84404829 | 36630.07568284 | 49511702429.3542 | 681470030572.0747 | 2021-01-18T23:59:59.999Z |
| 36642.23272357 | 37755.89185872 | 36069.80639361 | 36069.80639361 | 57244195485.50075 | 671081200699.8711 | 2021-01-19T23:59:59.999Z |
and so on ...
The Response object does have a .text attribute. The actual problem is that this page builds its table with JavaScript, so the th element is not present in the HTML that requests receives; switching to soup = bs4.BeautifulSoup(r.content, 'lxml') won't change that.
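If you would rather parse the rendered page than call the API, here is a minimal sketch with Selenium (assuming Firefox and geckodriver are installed and on your PATH):

import bs4
from selenium import webdriver

# Sketch: let the browser execute the page's JavaScript, then parse the result.
driver = webdriver.Firefox()  # assumes geckodriver is on PATH
driver.get('https://coinmarketcap.com/currencies/bitcoin/historical-data/')
soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
print(soup.find('th'))  # should now be a tag rather than None
driver.quit()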
I have to compare 2 different sources and identify all the mismatches for all IDs
Source_excel table
+-----+-------------+------+----------+
| id | name | City | flag |
+-----+-------------+------+----------+
| 101 | Plate | NY | Ready |
| 102 | Back washer | NY | Sold |
| 103 | Ring | MC | Planning |
| 104 | Glass | NMC | Ready |
| 107 | Cover | PR | Ready |
+-----+-------------+------+----------+
Source_dw table
+-----+----------+------+----------+
| id | name | City | flag |
+-----+----------+------+----------+
| 101 | Plate | NY | Planning |
| 102 | Nut | TN | Expired |
| 103 | Ring | MC | Planning |
| 104 | Top Wire | NY | Ready |
| 105 | Bolt | MC | Expired |
+-----+----------+------+----------+
Expected result
+-----+-------------+----------+------------+----------+------------+---------+------------------+
| ID | excel_name | dw_name | excel_flag | dw_flag | excel_city | dw_city | RESULT |
+-----+-------------+----------+------------+----------+------------+---------+------------------+
| 101 | Plate | Plate | Ready | Planning | NY | NY | FLAG_MISMATCH |
| 102 | Back washer | Nut | Sold | Expired | NY | TN | NAME_MISMATCH |
| 102 | Back washer | Nut | Sold | Expired | NY | TN | FLAG_MISMATCH |
| 102 | Back washer | Nut | Sold | Expired | NY | TN | CITY_MISMATCH |
| 103 | Ring | Ring | Planning | Planning | MC | MC | ALL_MATCH |
| 104 | Glass | Top Wire | Ready | Ready | NMC | NY | NAME_MISMATCH |
| 104 | Glass | Top Wire | Ready | Ready | NMC | NY | CITY_MISMATCH |
| 107 | Cover | | Ready | | PR | | MISSING IN DW |
| 105 | | Bolt | | Expired | | MC | MISSING IN EXCEL |
+-----+-------------+----------+------------+----------+------------+---------+------------------+
I'm new to Python and I have tried the code below, but it is not giving the expected result.
import pandas as pd

source_excel = pd.read_csv('C:/Mypython/Newyork/excel.csv', encoding="ISO-8859-1")
source_dw = pd.read_csv('C:/Mypython/Newyork/dw.csv', encoding="ISO-8859-1")

comparison_result = pd.merge(source_excel, source_dw, on='ID', how='outer', indicator=True)
both = comparison_result['_merge'] == 'both'
comparison_result.loc[both & (comparison_result['name_x'] != comparison_result['name_y']), 'Result'] = 'NAME_MISMATCH'
comparison_result.loc[both & (comparison_result['city_x'] != comparison_result['city_y']), 'Result'] = 'CITY_MISMATCH'
comparison_result.loc[both & (comparison_result['flag_x'] != comparison_result['flag_y']), 'Result'] = 'FLAG_MISMATCH'
comparison_result.loc[comparison_result['_merge'] == 'left_only', 'Result'] = 'Missing in dw'
comparison_result.loc[comparison_result['_merge'] == 'right_only', 'Result'] = 'Missing in excel'
comparison_result.loc[both, 'Result'] = 'ALL_Match'

csv_column = comparison_result[['ID', 'name_x', 'name_y', 'city_x', 'city_y', 'flag_x', 'flag_y', 'Result']]
print(csv_column)
Is there any other way I can check all the conditions and report each one in a separate row? If separate rows are not possible, at least I need them in the same column, with all mismatches listed together, something like FLAG_MISMATCH,CITY_MISMATCH.
You could do:
df = pd.merge(Source_excel, Source_dw, on='ID', how='left', suffixes=(None, '_dw'))
This will create a new dataframe like the one you want, although you'll have to reorder the columns as you want. Note that the '_dw' is a suffix and not a prefix in this case.
You can reorder the columns as you like by using this code:
# Complete with the column order you want
df = df[['ID', 'name', 'name_dw']]
For the result column I think you'll have to create a column for each condition you're trying to check (at least that's the way I know how to). Here's an example:
# This will return 1 if there's a match and 0 otherwise
df['result_flag'] = df.apply(lambda x: 1 if x.flag == x.flag_dw else 0, axis=1)
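The same check can be vectorized and extended to the other columns; a sketch, assuming the merge above (left columns keep their original names, right columns get the _dw suffix):

# Sketch: 1 where the pair matches, 0 otherwise, for each compared column.
df['result_name'] = (df['name'] == df['name_dw']).astype(int)
df['result_city'] = (df['City'] == df['City_dw']).astype(int)
df['result_flag'] = (df['flag'] == df['flag_dw']).astype(int)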
Here is a way to do the scoring:
df['result'] = 0

# repeated mask / df.loc statements suggest a loop over a list of tuples
mask = df['excel_flag'] != df['dw_flag']
df.loc[mask, 'result'] += 1

mask = df['excel_name'] != df['dw_name']
df.loc[mask, 'result'] += 10

df['result'] = df['result'].map({0: 'all match',
                                 1: 'flag mismatch',
                                 10: 'name mismatch',
                                 11: 'all mismatch'})
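If you need one row per mismatch, as in your expected result, a sketch along the same lines (assuming the merged frame df uses the excel_/dw_ column names above) loops over (excel_column, dw_column, label) tuples and collects the offending rows:

import pandas as pd

# Hypothetical column names, following the excel_/dw_ convention above.
checks = [('excel_name', 'dw_name', 'NAME_MISMATCH'),
          ('excel_city', 'dw_city', 'CITY_MISMATCH'),
          ('excel_flag', 'dw_flag', 'FLAG_MISMATCH')]

parts = []
for excel_col, dw_col, label in checks:
    bad = df[df[excel_col] != df[dw_col]].copy()
    bad['RESULT'] = label  # one labelled row per failed check
    parts.append(bad)

report = pd.concat(parts).sort_values('ID')
print(report)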
I have 2 views as below:
experiments:
select * from experiments;
+--------+--------------------+-----------------+
| exp_id | exp_properties | value |
+--------+--------------------+-----------------+
| 1 | indicator:chemical | phenolphthalein |
| 1 | base | NaOH |
| 1 | acid | HCl |
| 1 | exp_type | titration |
| 1 | indicator:color | faint_pink |
+--------+--------------------+-----------------+
calculations:
select * from calculations;
+--------+------------------------+--------------+
| exp_id | exp_report | value |
+--------+------------------------+--------------+
| 1 | molarity:base | 0.500000000 |
| 1 | volume:acid:in_ML | 23.120000000 |
| 1 | volume:base:in_ML | 5.430000000 |
| 1 | moles:H | 0.012500000 |
| 1 | moles:OH | 0.012500000 |
| 1 | molarity:acid | 0.250000000 |
+--------+------------------------+--------------+
I managed to pivot each of these views individually as below:
experiments_pivot:
+-------+--------------------+------+------+-----------+----------------+
|exp_id | indicator:chemical | base | acid | exp_type | indicator:color|
+-------+--------------------+------+------+-----------+----------------+
| 1 | phenolphthalein | NaOH | HCl | titration | faint_pink |
+-------+--------------------+------+------+-----------+----------------+
calculations_pivot:
+-------+---------------+---------------+--------------+-------------+------------------+-------------------+
|exp_id | molarity:base | molarity:acid | moles:H | moles:OH | volume:acid:in_ML| volume:base:in_ML |
+-------+---------------+---------------+--------------+-------------+------------------+-------------------+
| 1 | 0.500000000 | 0.250000000 | 0.012500000 | 0.012500000 | 23.120000000 | 5.430000000 |
+-------+---------------+---------------+--------------+-------------+------------------+-------------------+
My question is: how do I get these two pivot results as a single row? The desired result is as below:
+-------+--------------------+------+------+-----------+----------------+--------------+---------------+--------------+-------------+------------------+------------------+
|exp_id | indicator:chemical | base | acid | exp_type | indicator:color|molarity:base | molarity:acid | moles:H | moles:OH | volume:acid:in_ML| volume:base:in_ML |
+-------+--------------------+------+------+-----------+----------------+--------------+---------------+--------------+-------------+------------------+------------------+
| 1 | phenolphthalein | NaOH | HCl | titration | faint_pink | 0.500000000 | 0.250000000 | 0.012500000 | 0.012500000 | 23.120000000 | 5.430000000 |
+-------+--------------------+------+------+-----------+----------------+--------------+---------------+--------------+-------------+------------------+------------------+
Database used: MySQL
Important note: each of these views can have an increasing number of rows, hence I considered "dynamic pivoting" for each view individually.
For reference, below is the prepared statement I used to pivot experiments in MySQL (a similar statement pivots the other view):
SET @sql = NULL;
SELECT
  GROUP_CONCAT(DISTINCT
    CONCAT(
      'MAX(IF(exp_properties = ''',
      exp_properties,
      ''', value, NULL)) AS ',
      CONCAT('`', exp_properties, '`')
    )
  ) INTO @sql
FROM experiments;

SET @sql = CONCAT(
  'SELECT exp_id, ',
  @sql,
  ' FROM experiments GROUP BY exp_id'
);

PREPARE stmt FROM @sql;
EXECUTE stmt;
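One way to get both pivots into a single row, sketched from the same prepared-statement pattern (untested; note the value columns must be qualified, since both views have one), is to build one column list per view and pivot over a join:

SET @exp_cols = NULL;
SELECT GROUP_CONCAT(DISTINCT CONCAT(
  'MAX(IF(e.exp_properties = ''', exp_properties, ''', e.value, NULL)) AS ',
  CONCAT('`', exp_properties, '`')
)) INTO @exp_cols
FROM experiments;

SET @calc_cols = NULL;
SELECT GROUP_CONCAT(DISTINCT CONCAT(
  'MAX(IF(c.exp_report = ''', exp_report, ''', c.value, NULL)) AS ',
  CONCAT('`', exp_report, '`')
)) INTO @calc_cols
FROM calculations;

SET @sql = CONCAT(
  'SELECT e.exp_id, ', @exp_cols, ', ', @calc_cols,
  ' FROM experiments e JOIN calculations c ON c.exp_id = e.exp_id',
  ' GROUP BY e.exp_id'
);
PREPARE stmt FROM @sql;
EXECUTE stmt;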
So I have the following class, which prints the text and header you call it with. I have also provided the code I am using to call the Print function. The string 'outputstring' contains the text I want to print. My expected and actual output are below; printing seems to remove the spaces that are needed for legibility. How can I print while keeping the spaces?
Class:
# Printer class
class Printer(HtmlEasyPrinting):
    def __init__(self):
        HtmlEasyPrinting.__init__(self)

    def GetHtmlText(self, text):
        "Simple conversion of text. Use a more powerful version"
        html_text = text.replace('\n\n', '<P>')
        html_text = text.replace('\n', '<BR>')
        return html_text

    def Print(self, text, doc_name):
        self.SetHeader(doc_name)
        self.PrintText(self.GetHtmlText(text), doc_name)

    def PreviewText(self, text, doc_name):
        self.SetHeader(doc_name)
        HtmlEasyPrinting.PreviewText(self, self.GetHtmlText(text))
Expected Print:
+-------------------+---------------------------------+------+-----------------+-----------+
| Domain: | Mail Server: | TLS: | # of Employees: | Verified: |
+-------------------+---------------------------------+------+-----------------+-----------+
| bankofamerica.com | ltwemail.bankofamerica.com | Y | 239000 | Y |
| | rdnemail.bankofamerica.com | Y | | Y |
| | kcmemail.bankofamerica.com | Y | | Y |
| | rchemail.bankofamerica.com | Y | | Y |
| citigroup.com | mx-b.mail.citi.com | Y | 248000 | N |
| | mx-a.mail.citi.com | Y | | N |
| bnymellon.com | cluster9bny.us.messagelabs.com | ? | 51400 | N |
| | cluster9bnya.us.messagelabs.com | Y | | N |
| usbank.com | mail1.usbank.com | Y | 65565 | Y |
| | mail2.usbank.com | Y | | Y |
| | mail3.usbank.com | Y | | Y |
| | mail4.usbank.com | Y | | Y |
| us.hsbc.com | vhiron1.us.hsbc.com | Y | 255200 | Y |
| | vhiron2.us.hsbc.com | Y | | Y |
| | njiron1.us.hsbc.com | Y | | Y |
| | njiron2.us.hsbc.com | Y | | Y |
| | nyiron1.us.hsbc.com | Y | | Y |
| | nyiron2.us.hsbc.com | Y | | Y |
| pnc.com | cluster5a.us.messagelabs.com | Y | 49921 | N |
| | cluster5.us.messagelabs.com | ? | | N |
| tdbank.com | cluster5.us.messagelabs.com | ? | 0 | N |
| | cluster5a.us.messagelabs.com | Y | | N |
+-------------------+---------------------------------+------+-----------------+-----------+
Actual Print:
The same thing as expected but the spaces are removed making it very hard to read.
Function call:
def printFile():
    outputstring = txt_tableout.get(1.0, 'end')
    print(outputstring)
    app = wx.PySimpleApp()
    p = Printer()
    p.Print(outputstring, "Data Results")
For anyone else struggling, this is the modified class function I used to generate a nice table with all rows and columns.
def GetHtmlText(self, text):
    html_text = '<h3>Data Results:</h3><p><table border="2">'
    html_text += "<tr><td>Domain:</td><td>Mail Server:</td><td>TLS:</td><td># of Employees:</td><td>Verified</td></tr>"
    for row in root.ptglobal.to_csv():
        html_text += "<tr>"
        for x in range(len(row)):
            html_text += "<td>" + str(row[x]) + "</td>"
        html_text += "</tr>"
    return html_text + "</table></p>"
Maybe try
`html_text = text.replace(' ', '&nbsp;').replace('\n', '<br/>')`
That would replace your spaces with HTML non-breaking-space entities, but it would still not look right, since it is not rendered in a monospace font. This will be hard to automate; you really want to put it in a table structure, but that would require some work.
You probably want to invest a little more time in your HTML conversion; perhaps something like this (making assumptions based on what you have shown):
def GetHtmlText(self, text):
    "Simple conversion of text. Use a more powerful version"
    text_lines = text.splitlines()
    html_text = "<table>"
    html_text += "<tr><th>" + "</th><th>".join(text_lines[0].split(":")) + "</th></tr>"
    for line in text_lines[1:]:
        html_text += "<tr><td>" + "</td><td>".join(line.split()) + "</td></tr>"
    return html_text + "</table>"
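Alternatively, since the text is already space-aligned, a simpler sketch is to wrap it in a pre element, which preserves whitespace and renders in a monospace font (the escaping here is minimal and illustrative):

def GetHtmlText(self, text):
    # Sketch: <pre> keeps spaces and newlines exactly as laid out,
    # so the ASCII table prints as-is; escape HTML-significant characters first.
    escaped = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    return '<pre>' + escaped + '</pre>'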