{ "cells": [ { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# public import path; IPython.core.display is deprecated for these names\n", "from IPython.display import display, HTML\n", "display(HTML(\"\"))" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from selenium import webdriver\n", "from selenium.webdriver.chrome.options import Options\n", "from selenium.webdriver.common.by import By\n", "from selenium.common.exceptions import NoSuchElementException\n", "\n", "options = Options()\n", "#options.headless = True\n", "b = webdriver.Chrome(options=options)" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import time" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "still waiting\n", "still waiting\n", "still waiting\n", "still waiting\n" ] } ], "source": [ "b.get(\"https://tyler.caraza-harter.com/cs320/s20/materials/lec-19/page1.html\")\n", "\n", "# poll (check) until we have the data: up to 40 tries, 0.25s apart\n", "for i in range(40):\n", "    try:\n", "        b.find_element(By.ID, \"coords\")\n", "        break\n", "    except NoSuchElementException:\n", "        print(\"still waiting\")\n", "        time.sleep(0.25)\n", "else:\n", "    # loop ended without break, so the element never showed up\n", "    raise TimeoutError(\"element with id 'coords' never appeared\")" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "# name the parser so the result does not depend on which parsers are installed\n", "page = BeautifulSoup(b.page_source, \"html.parser\")\n", "tbls = page.find_all(\"table\")\n", "assert len(tbls) == 2" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['x', 'y'],\n", " ['0', '1'],\n", " ['2', '3'],\n", " ['4', '5'],\n", " ['6', '7'],\n", " ['8', '9'],\n", " ['10', '11'],\n", " ['12', '13'],\n", " ['14', '15'],\n", " ['16', '17'],\n", " ['18', '19']]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# text of every <td>, one inner list per <tr> of the second table\n", "rows = [[td.get_text() for td in tr.find_all(\"td\")]\n", "        for tr in tbls[1].find_all(\"tr\")]\n", "rows" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
xy
001
123
245
367
489
51011
61213
71415
81617
91819
\n", "
" ], "text/plain": [ " x y\n", "0 0 1\n", "1 2 3\n", "2 4 5\n", "3 6 7\n", "4 8 9\n", "5 10 11\n", "6 12 13\n", "7 14 15\n", "8 16 17\n", "9 18 19" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "pd.DataFrame(rows[1:], columns=rows[0])" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# quit() ends the whole driver session; close() only closes the current window\n", "b.quit()" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# locator constants used by find_element(By.ID, ...) in the cells below\n", "from selenium.webdriver.common.by import By\n", "\n", "options = Options()\n", "#options.headless = True\n", "b = webdriver.Chrome(options=options)" ] },
{ "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# page2.html\n", "b.get(\"https://tyler.caraza-harter.com/cs320/s20/materials/lec-19/page2.html\")" ] },
{ "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# keep clicking the \"more\" button until it is no longer on the page\n", "while True:\n", "    try:\n", "        btn = b.find_element(By.ID, \"more\")\n", "    except NoSuchElementException:\n", "        break\n", "    btn.click()\n", "    time.sleep(2)  # pause before looking for the button again" ] },
{ "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "# name the parser so the result does not depend on which parsers are installed\n", "page = BeautifulSoup(b.page_source, \"html.parser\")\n", "tbls = page.find_all(\"table\")\n", "assert len(tbls) == 1" ] },
{ "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "b.quit()" ] },
{ "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "options = Options()\n", "#options.headless = True\n", "b = webdriver.Chrome(options=options)" ] },
{ "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# page3.html\n", "b.get(\"https://tyler.caraza-harter.com/cs320/s20/materials/lec-19/page3.html\")" ] },
{ "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "btn = b.find_element(By.ID, \"login_btn\")\n", "btn" ] },
{ "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pw = b.find_element(By.ID, \"password\")\n", "pw" ] },
{ "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "pw.clear()" ] },
{ "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "pw.send_keys(\"fido\")" ] },
{ "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "btn.click()" ] },
{ "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "b.quit()" ] },
{ "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "options = Options()\n", "#options.headless = True\n", "b = webdriver.Chrome(options=options)" ] },
{ "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "# page4.html\n", "b.get(\"https://tyler.caraza-harter.com/cs320/s20/materials/lec-19/page4.html\")" ] },
{ "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "year_box = b.find_element(By.ID, \"year\")\n", "year_box" ] },
{ "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "search_btn = b.find_element(By.ID, \"search_btn\")\n", "search_btn" ] },
{ "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] },
{ "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def hurricane_count(year):\n", "    \"\"\"Search for `year` on the open page and return the number of result rows.\n", "\n", "    Clears the year box, types `year` into it, clicks the search button,\n", "    then counts the <tr> elements in the browser. One is subtracted from\n", "    that count (presumably for a header row -- confirm against page4.html).\n", "    \"\"\"\n", "    year_box.clear()\n", "    year_box.send_keys(str(year))\n", "    search_btn.click()\n", "\n", "    trs = b.find_elements(By.TAG_NAME, \"tr\")\n", "    return len(trs) - 1\n", "\n", "hurricane_count(1950)" ] },
{ "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "# one search per year; build the Series in a single step rather than\n", "# growing an empty, dtype-less Series one .loc assignment at a time\n", "s = pd.Series({year: hurricane_count(year) for year in range(1950, 2020)})" ] },
{ "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "import matplotlib\n", "matplotlib.rcParams[\"font.size\"] = 16" ] },
{ "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# red line plot of the values in `s` against its index\n", "ax = s.plot.line(color=\"r\")\n", "ax.set(xlabel=\"Year\", ylabel=\"Hurricane Count\")\n", "# hide the right and top borders of the plot area\n", "for side in (\"right\", \"top\"):\n", "    ax.spines[side].set_visible(False)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2" } }, "nbformat": 4, "nbformat_minor": 2 }