Most webpages are created for humans, but that doesn't mean we can't write programs to "scrape" information out of them. Today, we're going to learn Selenium, a powerful tool that lets us automate typing, clicks, and other actions in a real web browser. It will help us pull data from sites with a number of challenging designs, several of which are illustrated by simple examples here: https://tyler.caraza-harter.com/cs320/tricky/scrape.html
First, let's take a look at web scraping via two simpler tools, requests and BeautifulSoup.
If you don't already have these packages, install them: pip3 install requests beautifulsoup4. requests lets us execute GET (download) and other HTTP requests. Often, the requested file is an HTML file; in that case, BeautifulSoup lets us extract information from the HTML tags.
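As an aside, the object requests gives back for each request also carries the HTTP status code and headers, which let us confirm the server actually sent back HTML. A minimal sketch (example.com is just a placeholder URL):

import requests

r = requests.get("https://example.com")  # placeholder URL
print(r.status_code)                     # 200 means success
print(r.headers["Content-Type"])         # e.g. "text/html" for an HTML page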
We'll try to scrape the tables from this page: https://tyler.caraza-harter.com/cs320/tricky/page1.html. Visit it, right click on the page, then click "View Page Source".
You'll see something like this:
<html>
<head>
<script src="https://code.jquery.com/jquery-3.4.1.js"></script>
<script>
... LOTS OF JAVASCRIPT CODE HERE...
</script>
</head>
<body onload="main()">
<h1>Welcome</h1>
<h3>Here's a table</h3>
<table border=1 id='alpha'>
<tr><td>A</td><td>B</td><td>C</td></tr>
<tr><td>1</td><td>2</td><td>3</td></tr>
<tr><td>4</td><td>5</td><td>6</td></tr>
</table>
<h3>And another one...</h3>
</body>
</html>
Inside the <script> tags there is code in the JavaScript programming language. Once the page is loaded in the browser, this code starts executing and may make changes to the tags/elements.
In the above HTML, we see one table (<table>); however, the JavaScript code will automatically generate a second table. With the requests module, we can only grab the version of the page before the JavaScript runs. So in this example, we'll just extract data from that first table (later we'll use Selenium to get data from the second table too).
import requests
r = requests.get("https://tyler.caraza-harter.com/cs320/tricky/page1.html")
r.raise_for_status()
type(r)
requests.models.Response
html = r.text
type(html)
str
html[:200]
'<html>\n <head>\n <script src="https://code.jquery.com/jquery-3.4.1.js"></script>\n <script>\n function addTable() {\n var html = "<table border=1 id=\'coords\'>\\n";\n html += "<tr><'
Above, we see we can use .get to request a web page; a Response object is returned. The .raise_for_status() call makes sure we crash if it is an error page (such as a "404 Not Found" error).
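If we'd rather handle a bad URL than crash, we can catch the exception that .raise_for_status() throws. A small sketch (missing.html is a hypothetical URL used only for illustration):

import requests

try:
    r = requests.get("https://tyler.caraza-harter.com/cs320/tricky/missing.html")  # hypothetical URL
    r.raise_for_status()  # raises requests.exceptions.HTTPError for 4xx/5xx responses
    html = r.text
except requests.exceptions.HTTPError as e:
    print("could not fetch the page:", e)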
Once we're sure the page is good, we can access the .text attribute of the Response object. As shown above, this is a regular string.
We'll use BeautifulSoup to convert this text to a BeautifulSoup object, which is a searchable tree of elements.
from bs4 import BeautifulSoup
doc = BeautifulSoup(html, "html.parser")
type(doc)
bs4.BeautifulSoup
Looks like there are three tr tags (table rows) on the page:
trs = doc.findAll("tr")
len(trs)
3
trs
[<tr><td>A</td><td>B</td><td>C</td></tr>, <tr><td>1</td><td>2</td><td>3</td></tr>, <tr><td>4</td><td>5</td><td>6</td></tr>]
They are represented as Tag objects; Tag objects look like HTML when converted to a string. Alternatively, the .text attribute shows us the raw content, without all the surrounding HTML.
str(trs[0])
'<tr><td>A</td><td>B</td><td>C</td></tr>'
trs[0].text
'ABC'
The findAll method can be used on the whole page, or to search within a single element/Tag.
len(doc.findAll("td")) # 9 cells (td=table data) in whole page
9
len(trs[0].findAll("td")) # 3 cells (td=table data) in first row
3
Let's write a function that does three things: fetches a page with requests.get, parses the HTML with BeautifulSoup, and extracts the text from each cell of each table row:
def grab_table(url):
    r = requests.get(url)
    r.raise_for_status()
    doc = BeautifulSoup(r.text, "html.parser")
    rows = []
    for tr in doc.findAll("tr"):
        row = []
        for td in tr.findAll("td"):
            row.append(td.text)
        rows.append(row)
    return rows
rows = grab_table("https://tyler.caraza-harter.com/cs320/tricky/page1.html")
rows
[['A', 'B', 'C'], ['1', '2', '3'], ['4', '5', '6']]
The above is a short step away from getting a useful DataFrame:
import pandas as pd
header = rows[0]
data = rows[1:]
pd.DataFrame(data, columns=header)
   A  B  C
0  1  2  3
1  4  5  6
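If we plan to do this for several pages, we could wrap the pattern in a small helper. A sketch building on grab_table above (grab_df is our own name; it assumes, as on this page, that the first row is the header):

import pandas as pd

def grab_df(url):
    rows = grab_table(url)                          # list of lists of cell strings
    return pd.DataFrame(rows[1:], columns=rows[0])  # first row becomes the header

df = grab_df("https://tyler.caraza-harter.com/cs320/tricky/page1.html")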
Selenium is able to automate clicks and typing in an actual web browser. This will let us do things like take a screenshot of a page and actually execute the JavaScript in the <script> tags (to get the data in the second table too).
Selenium has some features of both requests and BeautifulSoup: it can both grab content and provide a searchable tree.
Let's create a web browser (of type WebDriver):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
options = Options()
options.headless = True
b = webdriver.Chrome(options=options)
type(b)
selenium.webdriver.chrome.webdriver.WebDriver
The headless = True setting means the browser is hidden, running in the background (this is necessary to run on a virtual machine without graphics). But we can still manipulate this browser and see what it is doing by taking screenshots:
b.get("https://tyler.caraza-harter.com/cs320/tricky/page1.html") # go to a page
b.save_screenshot("shot1.png")
True
from IPython.core.display import Image
Image("shot1.png")
Looks like that second table wasn't loaded yet. But if we wait a few seconds, it will be.
import time
time.sleep(3) # in seconds
b.save_screenshot("shot2.png")
Image("shot2.png")
Here's what happened in the browser: the page first loaded with only one table, and about a second later the JavaScript added the second one.
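Sleeping a fixed number of seconds is simple but fragile: too short and the table isn't there yet, too long and we waste time. Selenium also supports explicit waits that block until an element appears. A sketch, assuming the generated table keeps the id "coords" that we'll see in the page source below:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the JavaScript-generated table to appear
WebDriverWait(b, 10).until(EC.presence_of_element_located((By.ID, "coords")))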
Selenium's .page_source attribute lets us convert the new version of the page back to HTML (notice there are two <table>'s instead of the original one -- requests could only see the first one, but Selenium also sees the one generated by the JavaScript code).
print(b.page_source)
<html><head> <script src="https://code.jquery.com/jquery-3.4.1.js"></script> <script> function addTable() { var html = "<table border=1 id='coords'>\n"; html += "<tr><td>x</td><td>y</td></tr>\n"; for (i = 0; i < 10; i++) { html += "<tr><td>" + (i*2) + "</td><td>" + (i*2+1) + "</td></tr>\n"; } html += "</table>"; var t = document.createElement("div"); t.innerHTML = html; document.body.appendChild(t); } function main() { setTimeout(addTable, 1000) } </script> </head> <body onload="main()"> <h1>Welcome</h1> <h3>Here's a table</h3> <table border="1" id="alpha"> <tbody><tr><td>A</td><td>B</td><td>C</td></tr> <tr><td>1</td><td>2</td><td>3</td></tr> <tr><td>4</td><td>5</td><td>6</td></tr> </tbody></table> <h3>And another one...</h3> <div><table border="1" id="coords"> <tbody><tr><td>x</td><td>y</td></tr> <tr><td>0</td><td>1</td></tr> <tr><td>2</td><td>3</td></tr> <tr><td>4</td><td>5</td></tr> <tr><td>6</td><td>7</td></tr> <tr><td>8</td><td>9</td></tr> <tr><td>10</td><td>11</td></tr> <tr><td>12</td><td>13</td></tr> <tr><td>14</td><td>15</td></tr> <tr><td>16</td><td>17</td></tr> <tr><td>18</td><td>19</td></tr> </tbody></table></div></body></html>
One option would be to use this updated HTML string with BeautifulSoup to search for tables and data:
doc = BeautifulSoup(b.page_source, "html.parser")
tables = doc.findAll("table")
print(f"There are {len(tables)} table(s)")
There are 2 table(s)
Alternatively, Selenium has a (somewhat clunkier) interface for directly searching for elements:
tables = b.find_elements_by_tag_name("table")
print(f"There are {len(tables)} table(s)")
There are 2 table(s)
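Since the page source shows the generated table has id="coords", we can also look it up directly by id. A sketch in the same older find_element_by_* style used above (the NoSuchElementException we imported earlier is raised if the element isn't found):

coords = b.find_element_by_id("coords")
print(coords.text)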
Instead of BeautifulSoup Tag objects, we get Selenium WebElement objects. Fortunately, both have a .text attribute to see the raw data (without surrounding HTML tags):
type(tables[0])
selenium.webdriver.remote.webelement.WebElement
print(tables[0].text)
A B C 1 2 3 4 5 6
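WebElements can themselves be searched, so we can pull rows and cells out of the JavaScript-generated table much as we did with BeautifulSoup. A sketch in the same style (here tables[1] is the second, generated table):

rows = []
for tr in tables[1].find_elements_by_tag_name("tr"):
    row = [td.text for td in tr.find_elements_by_tag_name("td")]
    rows.append(row)

pd.DataFrame(rows[1:], columns=rows[0])  # first row ("x", "y") serves as the header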
A web browser like Chrome understands both HTML and JavaScript (and is able to run JavaScript that may update the elements corresponding to HTML tags). For simple pages, the requests module is a great way to fetch HTML and other resources, without any JavaScript engine. requests does not parse HTML, so it is often used in conjunction with BeautifulSoup (which can parse an HTML document into a searchable tree structure).
Selenium is much slower than requests, but it is able to control a real web browser (capable of executing JavaScript), so it allows data scientists to scrape many pages that would not be scrapable otherwise. Selenium also provides a searchable tree of elements, like BeautifulSoup, but the methods tend to be less convenient, so one may choose to still use BeautifulSoup with Selenium (though it's less necessary than when working with requests).
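One last housekeeping note: when we're done scraping, it's good practice to close the headless browser so the Chrome process doesn't keep running in the background:

b.quit()  # shuts down the browser (and its driver process)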