Skip to content

Commit

Permalink
feat: Chunk visual (#29)
Browse files Browse the repository at this point in the history
* chore: Poetry + precommit

* chore: Poetry

* feat: Encoders

* chore: Clean notebook outputs

* feat: Added encoders to query

* chore: Added a note to fix delete for Pinecone

* chore: Linting fix

* fix: Fix Pinecone deletion by file_url

* fix: Pinecone delete

* feat: Chunk visual
  • Loading branch information
simjak authored Feb 11, 2024
1 parent 7c9b3b2 commit 2242fc3
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ COHERE_API_KEY=
# Optional for walkthrough
PINECONE_API_KEY=
PINECONE_HOST=
PINECONE_INDEX=
PINECONE_INDEX=
103 changes: 103 additions & 0 deletions dev/embedding.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from models.file import File, FileType\n",
"from service.embedding import EmbeddingService\n",
"from termcolor import colored\n",
"\n",
"PINECONE_INDEX = os.getenv(\"PINECONE_INDEX\", \"\")\n",
"PINECONE_API_KEY = os.getenv(\"PINECONE_API_KEY\", \"\")\n",
"PINECONE_HOST = os.getenv(\"PINECONE_HOST\", \"\")\n",
"\n",
"file = File(\n",
" type=FileType.pdf,\n",
" url=\"https://arxiv.org/pdf/2402.05131.pdf\"\n",
")\n",
"vector_credentials = {\n",
" \"type\": \"pinecone\",\n",
" \"config\": {\n",
" \"api_key\": PINECONE_API_KEY,\n",
" \"host\": PINECONE_HOST,\n",
" }\n",
" }\n",
"\n",
"embedding_service = EmbeddingService(\n",
" files=[file],\n",
" index_name=PINECONE_INDEX,\n",
" vector_credentials=vector_credentials\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"docs = await embedding_service.generate_documents()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"chunks = await embedding_service.generate_chunks(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white']\n",
"\n",
"concatenated_document = \"\"\n",
"\n",
"for i, chunk in enumerate(chunks):\n",
" color = colors[i % len(colors)]\n",
" colored_text = colored(chunk.text, color)\n",
" print(colored_text)\n",
" concatenated_document += chunk.text + \" \"\n",
"\n",
"print(\"\\nConcatenated Document:\\n\", concatenated_document)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
2 changes: 0 additions & 2 deletions models/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@
from models.vector_database import VectorDatabase


# Step 1: Define the Encoder Enum
class EncoderEnum(str, Enum):
    """Names of the embedding encoders that a request payload may select.

    The ``str`` mixin makes every member an actual string, so a member
    compares equal to its plain value (``EncoderEnum.cohere == "cohere"``)
    and can be looked up from that value (``EncoderEnum("cohere")``).
    """

    cohere = "cohere"
    openai = "openai"
    huggingface = "huggingface"
    fastembed = "fastembed"


# Step 2: Use the Enum in RequestPayload
class RequestPayload(BaseModel):
files: List[File]
encoder: EncoderEnum
Expand Down
14 changes: 14 additions & 0 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ python-dotenv = "^1.0.1"
[tool.poetry.extras]
fastembed = ["fastembed"]

[tool.poetry.group.dev.dependencies]
termcolor = "^2.4.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Expand Down

0 comments on commit 2242fc3

Please sign in to comment.