diff --git a/.gitignore b/.gitignore index 6cb38ad..f02a459 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ __pycache__ .vscode _site posts/scraped +posts/scrape.py # pixi environments .pixi diff --git a/pixi.lock b/pixi.lock index 2358db9..4c51489 100644 --- a/pixi.lock +++ b/pixi.lock @@ -178,6 +178,8 @@ environments: - conda: https://conda.anaconda.org/conda-forge/win-64/zstandard-0.23.0-py311h53056dc_1.conda - conda: https://conda.anaconda.org/conda-forge/win-64/zstd-1.5.6-h0ea2cb4_0.conda - pypi: https://files.pythonhosted.org/packages/65/0b/74cec93a7b05edf4fc3ea1c899fe8a37f041d7b9d303c75abf7a162924e0/markdownify-0.14.1-py3-none-any.whl + - pypi: https://files.pythonhosted.org/packages/71/a9/bd88ac0bd498c91aab3aba2e393d1fa59f72a7243e9265ccbf4861ca4f64/playwright-1.49.1-py3-none-win_amd64.whl + - pypi: https://files.pythonhosted.org/packages/1d/0d/95993c08c721ec68892547f2117e8f9dfbcef2ca71e098533541b4a54d5f/pyee-12.0.0-py3-none-any.whl packages: - conda: https://conda.anaconda.org/conda-forge/win-64/_openmp_mutex-4.5-2_gnu.conda build_number: 8 @@ -1792,6 +1794,14 @@ packages: - pkg:pypi/platformdirs?source=hash-mapping size: 20448 timestamp: 1733232756001 +- pypi: https://files.pythonhosted.org/packages/71/a9/bd88ac0bd498c91aab3aba2e393d1fa59f72a7243e9265ccbf4861ca4f64/playwright-1.49.1-py3-none-win_amd64.whl + name: playwright + version: 1.49.1 + sha256: 47b23cb346283278f5b4d1e1990bcb6d6302f80c0aa0ca93dd0601a1400191df + requires_dist: + - greenlet==3.1.1 + - pyee==12.0.0 + requires_python: '>=3.9' - conda: https://conda.anaconda.org/conda-forge/noarch/prompt-toolkit-3.0.48-pyha770c72_1.conda sha256: 79fb7d1eeb490d4cc1b79f781bb59fe302ae38cf0a30907ecde75a7d399796cc md5: 368d4aa48358439e07a97ae237491785 @@ -1900,6 +1910,34 @@ packages: - pkg:pypi/pydantic-core?source=hash-mapping size: 1563380 timestamp: 1726526437164 +- pypi: 
https://files.pythonhosted.org/packages/1d/0d/95993c08c721ec68892547f2117e8f9dfbcef2ca71e098533541b4a54d5f/pyee-12.0.0-py3-none-any.whl + name: pyee + version: 12.0.0 + sha256: 7b14b74320600049ccc7d0e0b1becd3b4bd0a03c745758225e31a59f4095c990 + requires_dist: + - typing-extensions + - build ; extra == 'dev' + - flake8 ; extra == 'dev' + - flake8-black ; extra == 'dev' + - pytest ; extra == 'dev' + - black ; extra == 'dev' + - isort ; extra == 'dev' + - jupyter-console ; extra == 'dev' + - mkdocs ; extra == 'dev' + - mkdocs-include-markdown-plugin ; extra == 'dev' + - mkdocstrings[python] ; extra == 'dev' + - sphinx ; extra == 'dev' + - toml ; extra == 'dev' + - tox ; extra == 'dev' + - trio ; extra == 'dev' + - twine ; extra == 'dev' + - twisted ; extra == 'dev' + - validate-pyproject[all] ; extra == 'dev' + - trio ; python_full_version >= '3.7' and extra == 'dev' + - trio-typing ; python_full_version >= '3.7' and extra == 'dev' + - pytest-asyncio ; python_full_version >= '3.4' and extra == 'dev' + - pytest-trio ; python_full_version >= '3.7' and extra == 'dev' + requires_python: '>=3.8' - conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.18.0-pyhd8ed1ab_1.conda sha256: 0d6133545f268b2b89c2617c196fc791f365b538d4057ecd636d658c3b1e885d md5: b38dc0206e2a530e5c2cf11dc086b31a diff --git a/pixi.toml b/pixi.toml index f2f1a46..c29448d 100644 --- a/pixi.toml +++ b/pixi.toml @@ -19,3 +19,5 @@ pip = ">=24.3.1,<25" [pypi-dependencies] markdownify = ">=0.14.1, <0.15" +playwright = ">=1.49.1, <2" +ollama = "*" diff --git a/posts/parsing_webpages_with_llm.ipynb b/posts/parsing_webpages_with_llm.ipynb index 46e7807..298604e 100644 --- a/posts/parsing_webpages_with_llm.ipynb +++ b/posts/parsing_webpages_with_llm.ipynb @@ -32,20 +32,30 @@ "source": [ "## Reading a dynamic web pages and convert HTML to Markdown\n", "\n", - "The code for this part was written by ChatGPT. 
At least on Windows, the Playwright code cannot be run inside a Jupyter notebook, so I had to use a script. Here is the content of the strict, which downloads the dynamic HTML and converts it to Markdown and saves the Markdown files in the subdirectory `scraped`.\n", + "The code for this part was written by ChatGPT. At least on Windows, the Playwright code cannot be run inside a Jupyter notebook, so I had to use a script. Here is the content of the script, which downloads the dynamic HTML and converts it to Markdown and saves the Markdown files in the subdirectory `scraped`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess\n", + "from pathlib import Path\n", "\n", - "```py\n", + "script = r\"\"\"\n", "from playwright.sync_api import sync_playwright\n", "from markdownify import markdownify as md\n", "from pathlib import Path\n", "\n", - "urls = \"\"\"\n", + "urls = '''\n", "https://inspirehep.net/literature/1889335\n", "https://inspirehep.net/literature/2512593\n", "https://inspirehep.net/literature/2017107\n", "https://inspirehep.net/literature/2687746\n", - "https://inspirehep.net/literature/1928162\n", - "\"\"\"\n", + "https://inspirehep.net/literature/2727838\n", + "'''\n", "\n", "urls = [x.strip() for x in urls.split(\"\\n\") if x and not x.isspace()]\n", "\n", @@ -84,7 +94,13 @@ "\n", "\n", "scrape_to_markdown(urls, \"scraped\")\n", - "```" + "\"\"\"\n", + "\n", + "if not Path(\"scraped\").exists():\n", + " with open(\"scrape.py\", \"w\", encoding=\"utf-8\") as f:\n", + " f.write(script)\n", + "\n", + " subprocess.run([\"python\", \"scrape.py\"])" ] }, { @@ -165,20 +181,6 @@ "The converted Markdown contains mistakes, where the conversion process garbled up the structure of the document. Let's see whether the LLM can make sense of this raw text. We want it to extract the authors, the journal data, the title, and the DOI." 
] }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import ollama\n", - "from pathlib import Path\n", - "\n", - "input_dir = Path(\"scraped\")\n", - "\n", - "documents = [fn.open(encoding=\"utf-8\").read() for fn in input_dir.glob(\"*.md\")]" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -196,43 +198,42 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import ollama\n", + "from pathlib import Path\n", + "\n", + "input_dir = Path(\"scraped\")\n", + "\n", + "documents = [fn.open(encoding=\"utf-8\").read() for fn in input_dir.glob(\"*.md\")]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.0: Roel Aaij, JHEP 01 (2022) 166, \"Measurement of prompt charged-particle production in pp collisions at s=13 TeV\", [10.1007/JHEP01(2022)166](https://doi.org/10.1007/JHEP01(2022)166)\n", - "0.1: Roel Aaij, JHEP 01 (2022) 166, \"Measurement of prompt charged-particle production in pp collisions at s=13 TeV\", [10.1007/JHEP01(2022)166](https://doi.org/10.1007/JHEP01(2022)166)\n", - "0.2: Roel Aaij, JHEP 01 (2022) 166, \"Measurement of prompt charged-particle production in pp collisions at s=13 TeV\", [10.1007/JHEP01(2022)166](https://doi.org/10.1007/JHEP01(2022)166).\n", - "1.0: Flavia Gesualdi, Hans Dembinski, Kenji Shinozaki, Daniel A. Supanitsky, Tanguy Pierog, PoS ICRC2021 (2021) 473, \"On the muon scale of air showers and its application to the AGASA data\", [10.22323/1.395.0473](https://doi.org/10.22323/1.395.0473)\n", - "1.1: Flavia Gesualdi, Hans Dembinski, Kenji Shinozaki, Daniel A. 
Supanitsky, Tanguy Pierog, PoS ICRC2021 (2021) 473, \"On the muon scale of air showers and its application to the AGASA data\", [10.22323/1.395.0473](https://doi.org/10.22323/1.395.0473).\n", - "1.2: Flavia Gesualdi, Hans Dembinski, Kenji Shinozaki, Daniel A. Supanitsky, Tanguy Pierog, PoS ICRC2021 (2021) 473, \"On the muon scale of air showers and its application to the AGASA data\", [10.22323/1.395.0473](https://doi.org/10.22323/1.395.0473).\n", - "2.0: Tanguy Pierog, Sebastian Baur, Hans Dembinski, Matías Perlin, Ralf Ulrich, PoS ICRC2021 (2021) 469, \"When heavy ions meet cosmic rays: potential impact of QGP formation on the muon puzzle\", [10.22323/1.395.0469](https://doi.org/10.22323/1.395.0469).\n", - "2.1: Tanguy Pierog, Sebastian Baur, Hans Dembinski, Matías Perlin, Ralf Ulrich, PoS ICRC2021 (2021) 469, \"When heavy ions meet cosmic rays: potential impact of QGP formation on the muon puzzle\", [10.22323/1.395.0469](https://doi.org/10.22323/1.395.0469).\n", - "2.2: Tanguy Pierog, Sebastian Baur, Hans Dembinski, Matías Perlin, Ralf Ulrich, PoS ICRC2021 (2021) 469, \"When heavy ions meet cosmic rays: potential impact of QGP formation on the muon puzzle\", [10.22323/1.395.0469](https://doi.org/10.22323/1.395.0469).\n", - "3.0: Hans Dembinski, Matthew Kenzie, Christoph Langenbruch, Michael Schmelling, Nucl.Instrum.Meth.A 1040 (2022) 167270, \"Custom Orthogonal Weight functions (COWs) for event classification\", [10.1016/j.nima.2022.167270](https://doi.org/10.1016/j.nima.2022.167270)\n", - "3.1: Hans Dembinski, Matthew Kenzie, Christoph Langenbruch, Michael Schmelling, Nucl.Instrum.Meth.A 1040 (2022) 167270, \"Custom Orthogonal Weight functions (COWs) for event classification\", [10.1016/j.nima.2022.167270](https://doi.org/10.1016/j.nima.2022.167270).\n", - "3.2: Hans Dembinski, Matthew Kenzie, Christoph Langenbruch, Michael Schmelling, Nucl.Instrum.Meth.A 1040 (2022) 167270, \"Custom Orthogonal Weight functions (COWs) for event classification\", 
[10.1016/j.nima.2022.167270](https://doi.org/10.1016/j.nima.2022.167270).\n", - "4.0: Johannes Albrecht, Lorenzo Cazon, Hans Dembinski, Anatoli Fedynitch, Karl-Heinz Kampert, Astrophys.Space Sci. 367 (2022) 3, \"The Muon Puzzle in cosmic-ray induced air showers and its connection to the Large Hadron Collider\", [10.1007/s10509-022-04054-5](https://doi.org/10.1007/s10509-022-04054-5).\n", - "4.1: Johannes Albrecht, Lorenzo Cazon, Hans Dembinski, Anatoli Fedynitch, Karl-Heinz Kampert, Astrophys.Space Sci., \"The Muon Puzzle in cosmic-ray induced air showers and its connection to the Large Hadron Collider\", [10.1007/s10509-022-04054-5](https://doi.org/10.1007/s10509-022-04054-5)\n", - "4.2: Johannes Albrecht, Hans Dembinski, Anatoli Fedynitch, Karl-Heinz Kampert, Astrophys.Space Sci., \"The Muon Puzzle in cosmic-ray induced air showers and its connection to the Large Hadron Collider\", [10.1007/s10509-022-04054-5](https://doi.org/10.1007/s10509-022-04054-5)\n", - "5.0: Hans Peter Dembinski, Ahmed Abdelmotteleb, Eur.Phys.J.C 82 (2022) 1043, \"A new maximum-likelihood method for template fits\", [10.1140/epjc/s10052-022-11019-z](https://doi.org/10.1140/epjc/s10052-022-11019-z).\n", - "5.1: Hans Peter Dembinski, Ahmed Abdelmotteleb, Eur.Phys.J.C 82 (2022) 1043, \"A new maximum-likelihood method for template fits\", [10.1140/epjc/s10052-022-11019-z](https://doi.org/10.1140/epjc/s10052-022-11019-z).\n", - "5.2: Hans Peter Dembinski, Ahmed Abdelmotteleb, Eur.Phys.J.C 82 (2022) 1043, \"A new maximum-likelihood method for template fits\", [10.1140/epjc/s10052-022-11019-z](https://doi.org/10.1140/epjc/s10052-022-11019-z)\n", - "6.0: David Maurin, Markus Ahlers, Hans Dembinski, Andreas Haungs, Pierre-Simon Mangeard, Eur. Phys. J. C 83 (2023) 971, \"A cosmic-ray database update: CRDB v4.1\", [10.1140/epjc/s10052-023-12092-8](https://doi.org/10.1140/epjc/s10052-023-12092-8).\n", - "6.1: David Maurin, Markus Ahlers, Hans Dembinski, Andreas Haungs, Pierre-Simon Mangeard, Eur. 
Phys. J. C 83 (2023) 971, \"A cosmic-ray database update: CRDB v4.1\", [10.1140/epjc/s10052-023-12092-8](https://doi.org/10.1140/epjc/s10052-023-12092-8).\n", - "6.2: David Maurin, Markus Ahlers, Hans Dembinski, Andreas Haungs, Pierre-Simon Mangeard, Eur.Phys.J.C 83 (2023) 971, \"A cosmic-ray database update: CRDB v4.1\", [10.1140/epjc/s10052-023-12092-8](https://doi.org/10.1140/epjc/s10052-023-12092-8).\n", - "7.0: Hans Dembinski, Anatoli Fedynitch, Anton Prosekin, PoS ICRC2023 (2023) 189, \"Chromo: An event generator frontend for particle and astroparticle physics\", [10.22323/1.444.0189](https://doi.org/10.22323/1.444.0189).\n", - "7.1: Hans Dembinski, Anatoli Fedynitch, Anton Prosekin, PoS ICRC2023 (2023) 189, \"Chromo: An event generator frontend for particle and astroparticle physics\", [10.22323/1.444.0189](https://doi.org/10.22323/1.444.0189).\n", - "7.2: Hans Dembinski, Anatoli Fedynitch, Anton Prosekin, PoS ICRC2023 (2023) 189, \"Chromo: An event generator frontend for particle and astroparticle physics\", [10.22323/1.444.0189](https://doi.org/10.22323/1.444.0189).\n", - "8.0: L. Cazon, H.P. Dembinski, G. Parente, F. Riehn, A.A. Watson, PoS ICRC2023 (2023) 431, \"The muon measurements of Haverah Park and their connection to the muon puzzle\", [10.22323/1.444.0431](https://doi.org/10.22323/1.444.0431).\n", - "8.1: L. Cazon, H.P. Dembinski, G. Parente, F. Riehn, A.A. Watson, PoS ICRC2023 (2023) 431, \"The muon measurements of Haverah Park and their connection to the muon puzzle\", [10.22323/1.444.0431](https://doi.org/10.22323/1.444.0431)\n", - "8.2: L. Cazon, H.P. Dembinski, G. Parente, F. Riehn, A.A. 
Watson, PoS ICRC2023 (2023) 431, \"The muon measurements of Haverah Park and their connection to the muon puzzle\", [10.22323/1.444.0431](https://doi.org/10.22323/1.444.0431).\n", - "9.0: Hans Dembinski, Michael Schmelling, arXiv:2110.00294, \"Bias, variance, and confidence intervals for efficiency estimators in particle physics experiments\", [arxiv.org/abs/2110.00294](https://arxiv.org/abs/2110.00294).\n", - "9.1: Hans Dembinski, Michael Schmelling, arXiv:2110.00294, \"Bias, variance, and confidence intervals for efficiency estimators in particle physics experiments\", [10.1007/JHEP01(2022)166](https://doi.org/10.1007/JHEP01(2022)166).\n", - "9.2: Hans Dembinski, Michael Schmelling, arXiv:2110.00294, \"Bias, variance, and confidence intervals for efficiency estimators in particle physics experiments\", [https://arxiv.org/abs/2110.00294](https://arxiv.org/abs/2110.00294)\n" + "0.0: Roel Aaij et al., JHEP 01 (2022) 166, \"Measurement of prompt charged-particle production in pp collisions at s=13 TeV\", [10.1007/JHEP01(2022)166](https://doi.org/10.1007/JHEP01(2022)166)\n", + "0.1: Roel Aaij et al., JHEP 01 (2022) 166, \"Measurement of prompt charged-particle production in pp collisions at s=13 TeV\", [10.1007/JHEP01(2022)166](https://doi.org/10.1007/JHEP01(2022)166)\n", + "0.2: Roel Aaij et al., JHEP 01 (2022) 166, \"Measurement of prompt charged-particle production in pp collisions at s=13 TeV\", [10.1007/JHEP01(2022)166](https://doi.org/10.1007/JHEP01(2022)166)\n", + "1.0: Johannes Albrecht, Hans Dembinski, Anatoli Fedynitch, Karl-Heinz Kampert, Astropart.Space Sci. 
367 (2022) 3, 27, \"The Muon Puzzle in cosmic-ray induced air showers and its connection to the Large Hadron Collider\", [10.1007/s10509-022-04054-5](https://doi.org/10.1007/s10509-022-04054-5)\n", + "1.1: Johanes Albrecht, Lorenzo Cazon, Hans Dembinski, Anatoli Fedynitch, Karl-Heinz Kampert, Astrophys.Space Sci., \"The Muon Puzzle in cosmic-ray induced air showers and its connection to the Large Hadron Collider\", [10.1007/s10509-022-04054-5](https://doi.org/10.1007/s10509-022-04054-5)\n", + "1.2: Johannes Albrecht et al., Astrophys.Space Sci. 367 (2022) 3, \"The Muon Puzzle in cosmic-ray induced air showers and its connection to the Large Hadron Collider\", [10.1007/s10509-022-04054-5](https://doi.org/10.1007/s10509-022-04054-5)\n", + "2.0: Hans Peter Dembinski, Ahmed Abdelmotteleb, Eur.Phys.J.C 82 (2022) 1043, \"A new maximum-likelihood method for template fits\", [10.1140/epjc/s10052-022-11019-z](https://doi.org/10.1140/epjc/s10052-022-11019-z).\n", + "2.1: Hans Peter Dembinski, Ahmed Abdelmotteleb, Eur.Phys.J.C 82 (2022) 1043, \"A new maximum-likelihood method for template fits\", [10.1140/epjc/s10052-022-11019-z](https://doi.org/10.1140/epjc/s10052-022-11019-z).\n", + "2.2: Hans Peter Dembinski, Ahmed Abdelmotteleb, Eur.Phys.J.C 82 (2022) 1043, \"A new maximum-likelihood method for template fits\", [10.1140/epjc/s10052-022-11019-z](https://doi.org/10.1140/epjc/s10052-022-11019-z).\n", + "3.0: L. Cazon et al., PoS ICRC2023 (2023) 431, \"The muon measurements of Haverah Park and their connection to the muon puzzle\", [10.22323/1.444.0431](https://doi.org/10.22323/1.444.0431).\n", + "3.1: L. Cazon et al., PoS ICRC2023 (2023) 431, \"The muon measurements of Haverah Park and their connection to the muon puzzle\", [10.22323/1.444.0431](https://doi.org/10.22323/1.444.0431).\n", + "3.2: L. 
Cazon et al., PoS ICRC2023 (2023) 431, \"The muon measurements of Haverah Park and their connection to the muon puzzle\", [10.22323/1.444.0431](https://doi.org/10.22323/1.444.0431).\n", + "4.0: Hans Dembinski, Michael Schmelling, \"Bias, variance, and confidence intervals for efficiency estimators in particle physics experiments\", [arXiv:2110.00294](https://arxiv.org/abs/2110.00294)\n", + "4.1: Hans Dembinski, Michael Schmelling, arXiv:2110.00294, \"Bias, variance, and confidence intervals for efficiency estimators in particle physics experiments\", [10.1007/JHEP01(2022)166](https://doi.org/10.1007/JHEP01(2022)166)\n", + "4.2: Hans Dembinski, Michael Schmelling, \"Bias, variance, and confidence intervals for efficiency estimators in particle physics experiments\", arXiv:2110.00294\n" ] } ], @@ -266,9 +267,10 @@ " d = doc[:doc.index(\"###\")]\n", " prompt = prompt_template.format(text=d)\n", " for trial in range(3):\n", - " response = ollama.generate(model='llama3-chatqa', prompt=prompt, options={\"temperature\": 0.3})\n", - " # as a tiny bit of post-processing we replace newlines with spaces\n", - " text = response.response.replace('\\n', '')\n", + " # a low temperature seems to make the output more reliable\n", + " response = ollama.generate(model='llama3-chatqa', prompt=prompt, options={\"temperature\": 0.3, \"seed\": trial})\n", + " # tiny bit of post-processing: remove newlines, trim whitespace\n", + " text = response.response.replace('\\n', '').strip()\n", " print(f\"{idoc}.{trial}: {text}\")" ] }, diff --git a/posts/scrape_to_markdown.py b/posts/scrape_to_markdown.py deleted file mode 100644 index 4f58d9e..0000000 --- a/posts/scrape_to_markdown.py +++ /dev/null @@ -1,58 +0,0 @@ -from playwright.sync_api import sync_playwright -from markdownify import markdownify as md -from pathlib import Path - -urls = """ -https://inspirehep.net/literature/1889335 -https://inspirehep.net/literature/2512593 -https://inspirehep.net/literature/2017107 
-https://inspirehep.net/literature/2687746 -https://inspirehep.net/literature/1928162 -""" - -urls = [x.strip() for x in urls.split("\n") if x and not x.isspace()] - - -def scrape_to_markdown(urls, output_dir): - output_dir = Path(output_dir) - output_dir.mkdir(exist_ok=True, parents=True) - - with sync_playwright() as p: - # Launch a headless browser - browser = p.chromium.launch(headless=True) - - for url in urls: - output_fn = ( - url.replace("://", "_").replace("/", "_").replace(".", "_") + ".md" - ) - ofile = output_dir / output_fn - page = browser.new_page() - - # Navigate to the page - page.goto(url) - - # Wait for JavaScript-rendered content to load - # (Adjust the selector or timeout as needed for your specific page) - page.wait_for_load_state( - "networkidle" - ) # Wait for network requests to finish - - # Get the rendered HTML content - rendered_html = page.content() - - page.close() - - # Convert HTML to Markdown - markdown_content = md(rendered_html) - - # Save the Markdown to a file - with open(ofile, "w", encoding="utf-8") as file: - file.write(markdown_content) - - print(f"Saved {ofile!r}") - - # Close the browser - browser.close() - - -scrape_to_markdown(urls, "scraped")