Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Graphrag integration #4612

Open
wants to merge 43 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
e3e8f45
add initial global search draft
lpinheiroms Dec 7, 2024
8242378
add graphrag dep
lpinheiroms Dec 9, 2024
fb2fb19
Merge branch 'main' into lpinheiro/feat/add-graphrag-tools
lpinheiroms Dec 10, 2024
a13c18b
fix local search embedding
lpinheiroms Dec 17, 2024
8f3c484
linting
lpinheiroms Dec 17, 2024
0c05047
add from config constructor
lpinheiroms Dec 17, 2024
0e53f91
Merge branch 'main' into lpinheiro/feat/add-graphrag-tools
lspinheiro Dec 17, 2024
c1e7ea2
remove draft notebook
lpinheiroms Dec 17, 2024
a8b38ad
Merge branch 'main' into lpinheiro/feat/add-graphrag-tools
lspinheiro Dec 19, 2024
6d61c8e
update config factory and add docstrings
lpinheiroms Dec 20, 2024
1c4ed3d
add graphrag sample
lpinheiroms Dec 20, 2024
95f329c
add sample prompts
lpinheiroms Dec 20, 2024
3bc104b
update readme
lpinheiroms Dec 20, 2024
2ae6812
Merge branch 'main' into lpinheiro/feat/add-graphrag-tools
lspinheiro Dec 20, 2024
33523df
update deps
lpinheiroms Dec 20, 2024
8080ddb
Add API docs
ekzhu Dec 30, 2024
603c1c9
Update python/samples/agentchat_graphrag/requirements.txt
ekzhu Dec 30, 2024
934230b
Update python/samples/agentchat_graphrag/requirements.txt
ekzhu Dec 30, 2024
1c5fcd3
merge main, fix conflicts
lpinheiroms Dec 30, 2024
4f0c71f
update docstrings with snippet and doc ref
lpinheiroms Dec 30, 2024
e3dc1f9
lint
lpinheiroms Dec 30, 2024
f24fb6c
improve set up instructions in docstring
lpinheiroms Jan 3, 2025
4a5d611
lint
lpinheiroms Jan 3, 2025
74a2a23
Merge branch 'main' into lpinheiro/feat/add-graphrag-tools
lpinheiroms Jan 3, 2025
cac2aef
update lock
lpinheiroms Jan 3, 2025
e42f027
Update python/packages/autogen-ext/src/autogen_ext/tools/graphrag/_gl…
lspinheiro Jan 4, 2025
e60a9aa
Update python/packages/autogen-ext/src/autogen_ext/tools/graphrag/_lo…
lspinheiro Jan 4, 2025
180373a
add unit tests
lpinheiroms Jan 7, 2025
e6f3eea
Merge branch 'main' into lpinheiro/feat/add-graphrag-tools
lspinheiro Jan 7, 2025
4fc1fd8
update lock
lspinheiro Jan 7, 2025
ef49c97
Merge branch 'main' into lpinheiro/feat/add-graphrag-tools
lspinheiro Jan 8, 2025
3c2735e
update uv lock
lspinheiro Jan 8, 2025
330bb18
add docstring newlines
lspinheiro Jan 8, 2025
47c5826
stubs and typing on graphrag tests
lspinheiro Jan 8, 2025
7b4142c
Merge branch 'main' into lpinheiro/feat/add-graphrag-tools
lspinheiro Jan 9, 2025
c35147b
fix docstrings
lspinheiro Jan 9, 2025
2ac3edf
fix mypy error
lspinheiro Jan 9, 2025
dff93ea
+ linting and type fixes
lspinheiro Jan 9, 2025
b9b3ec0
type fix graphrag sample
lspinheiro Jan 9, 2025
483b567
Update python/packages/autogen-ext/src/autogen_ext/tools/graphrag/_gl…
lspinheiro Jan 10, 2025
eec0d2b
Update python/packages/autogen-ext/src/autogen_ext/tools/graphrag/_lo…
lspinheiro Jan 10, 2025
6351b38
Update python/samples/agentchat_graphrag/requirements.txt
lspinheiro Jan 10, 2025
8fed349
Merge branch 'main' into lpinheiro/feat/add-graphrag-tools
lspinheiro Jan 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion python/packages/autogen-ext/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@ langchain = ["langchain_core~= 0.3.3"]
azure = ["azure-core", "azure-identity"]
docker = ["docker~=7.0"]
openai = ["openai>=1.3", "aiofiles"]
file-surfer = ["markitdown>=0.0.1a2"]
file-surfer = [
"autogen-agentchat==0.4.0.dev11",
"markitdown>=0.0.1a2",
]
graphrag = ["graphrag>=1.0.0"]
web-surfer = [
"playwright>=1.48.0",
"pillow>=11.0.0",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from ._config import (
EmbeddingConfig,
GlobalContextConfig,
GlobalDataConfig,
LocalContextConfig,
LocalDataConfig,
MapReduceConfig,
SearchConfig,
)
from ._global_search import GlobalSearchTool
from ._local_search import LocalSearchTool

# Names exported by this package: the two search tools first, then the
# configuration models imported from ._config above.
__all__ = [
    "GlobalSearchTool",
    "LocalSearchTool",
    "GlobalDataConfig",
    "LocalDataConfig",
    "GlobalContextConfig",
    "LocalContextConfig",
    "MapReduceConfig",
    "SearchConfig",
    "EmbeddingConfig",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from typing import Callable, Literal, Optional

from pydantic import BaseModel


class DataConfig(BaseModel):
    # Base settings shared by GlobalDataConfig and LocalDataConfig: where the
    # indexed tables live and which entity tables / community level to use.

    # Directory holding the parquet tables. Required (no default).
    # GlobalSearchTool reads "<input_dir>/<table name>.parquet" for each table.
    input_dir: str
    # Table names are given without the ".parquet" suffix.
    entity_table: str = "create_final_nodes"
    entity_embedding_table: str = "create_final_entities"
    # Forwarded by GlobalSearchTool to graphrag's read_indexer_reports and
    # read_indexer_entities.
    community_level: int = 2


class GlobalDataConfig(DataConfig):
    # Data settings for global search: adds the community tables on top of the
    # entity tables inherited from DataConfig.

    # Table names without the ".parquet" suffix; GlobalSearchTool loads both
    # from DataConfig.input_dir.
    community_table: str = "create_final_communities"
    community_report_table: str = "create_final_community_reports"


class LocalDataConfig(DataConfig):
    # Data settings for local search: adds relationship and text-unit tables on
    # top of the entity tables inherited from DataConfig.
    # NOTE(review): the consumer (presumably LocalSearchTool) is not shown here;
    # confirm these names follow the same "<input_dir>/<name>.parquet" convention.

    relationship_table: str = "create_final_relationships"
    text_unit_table: str = "create_final_text_units"


class ContextConfig(BaseModel):
    # Base settings shared by GlobalContextConfig and LocalContextConfig.

    # Token budget for the data placed in the search context.
    # GlobalContextConfig overrides this default.
    max_data_tokens: int = 8000


class GlobalContextConfig(ContextConfig):
    # Context-building settings for global search. GlobalSearchTool forwards
    # every field below to graphrag as `context_builder_params`, using the
    # field name as the key (max_data_tokens is sent under the key "max_tokens").

    use_community_summary: bool = False
    shuffle_data: bool = True
    include_community_rank: bool = True
    min_community_rank: int = 0
    community_rank_name: str = "rank"
    include_community_weight: bool = True
    community_weight_name: str = "occurrence weight"
    normalize_community_weight: bool = True
    # Overrides the ContextConfig default of 8000. GlobalSearchTool also passes
    # this value as GlobalSearch's `max_data_tokens`.
    max_data_tokens: int = 12000


class LocalContextConfig(ContextConfig):
    # Context-building settings for local search. Inherits max_data_tokens
    # (default 8000) from ContextConfig.
    # NOTE(review): the consumer is not shown here; the field names suggest they
    # map onto graphrag's local-search context builder parameters — confirm.

    # presumably fractions of the context budget given to text units and
    # community reports respectively — TODO confirm against the consumer
    text_unit_prop: float = 0.5
    community_prop: float = 0.25
    include_entity_rank: bool = True
    rank_description: str = "number of relationships"
    include_relationship_weight: bool = True
    relationship_ranking_attribute: str = "rank"


class MapReduceConfig(BaseModel):
    # LLM and response settings for the map and reduce stages of global search.

    # GlobalSearchTool sends these as "max_tokens" / "temperature" in
    # `map_llm_params`.
    map_max_tokens: int = 1000
    map_temperature: float = 0.0
    # GlobalSearchTool sends these as "max_tokens" / "temperature" in
    # `reduce_llm_params`.
    reduce_max_tokens: int = 2000
    reduce_temperature: float = 0.0
    # The remaining fields are passed straight through as GlobalSearch keyword
    # arguments of the same name.
    allow_general_knowledge: bool = False
    json_mode: bool = False
    response_type: str = "multiple paragraphs"


class SearchConfig(BaseModel):
    # LLM and response settings for a single-stage search.
    # NOTE(review): not referenced by the global search tool shown here;
    # presumably used by LocalSearchTool — confirm.

    max_tokens: int = 1500
    temperature: float = 0.0
    response_type: str = "multiple paragraphs"


class EmbeddingConfig(BaseModel):
    # Connection settings for an embedding model.
    # NOTE(review): the consumer is not shown here; presumably used by the local
    # search tool to build an OpenAI / Azure OpenAI embedding client — confirm.

    api_key: Optional[str] = None
    # The only required field (no default).
    model: str
    api_base: Optional[str] = None
    deployment_name: Optional[str] = None
    api_version: Optional[str] = None
    # Restricted to "azure" or "openai"; defaults to "openai".
    api_type: Literal["azure", "openai"] = "openai"
    organization: Optional[str] = None
    # Optional zero-argument callable returning a token string.
    azure_ad_token_provider: Optional[Callable[[], str]] = None
    max_retries: int = 10
    # presumably seconds — TODO confirm against the consumer
    request_timeout: float = 180.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# mypy: disable-error-code="no-any-unimported,misc"
import pandas as pd
import tiktoken
from autogen_core import CancellationToken
from autogen_core.tools import BaseTool
from autogen_ext.models.openai import AzureOpenAIChatCompletionClient
from pydantic import BaseModel, Field

from graphrag.query.indexer_adapters import (
read_indexer_communities,
read_indexer_entities,
read_indexer_reports,
)
from graphrag.query.llm.base import BaseLLM
from graphrag.query.structured_search.global_search.community_context import GlobalCommunityContext
from graphrag.query.structured_search.global_search.search import GlobalSearch

from ._config import GlobalContextConfig as ContextConfig
from ._config import GlobalDataConfig as DataConfig
from ._config import MapReduceConfig
from ._model_adapter import GraphragOpenAiModelAdapter

# Default configs used as the default values of the `context_config` and
# `mapreduce_config` parameters of GlobalSearchTool.__init__ / from_config.
# `ContextConfig` here is the GlobalContextConfig alias imported above.
# NOTE: these are single module-level instances, shared by every call that
# relies on the defaults.
_default_context_config = ContextConfig()
_default_mapreduce_config = MapReduceConfig()


class GlobalSearchToolArgs(BaseModel):
    # Argument model for GlobalSearchTool.run (registered as the tool's
    # `args_type`).

    # Required free-text query; passed unchanged to the search engine.
    query: str = Field(..., description="The user query to perform global search on.")


class GlobalSearchToolReturn(BaseModel):
    # Result model for GlobalSearchTool.run (registered as the tool's
    # `return_type`).

    # The search engine's response text.
    answer: str


class GlobalSearchTool(BaseTool[GlobalSearchToolArgs, GlobalSearchToolReturn]):
    """Tool that answers a query using graphrag's global search.

    All parquet tables named in ``data_config`` are loaded when the tool is
    constructed, and a ``GlobalSearch`` engine is built once and reused for
    every call to :meth:`run`.
    """

    def __init__(
        self,
        token_encoder: tiktoken.Encoding,
        llm: BaseLLM,
        data_config: DataConfig,
        context_config: ContextConfig = _default_context_config,
        mapreduce_config: MapReduceConfig = _default_mapreduce_config,
    ) -> None:
        """Load the indexed tables and build the global search engine.

        Args:
            token_encoder: Encoder handed to both the context builder and the
                search engine.
            llm: graphrag LLM used by the search engine.
            data_config: Input directory, table names and community level.
            context_config: Settings forwarded as ``context_builder_params``.
            mapreduce_config: Settings for the map and reduce LLM calls and
                the response format.

        Raises:
            Any error raised by ``pandas.read_parquet`` (for example a missing
            file) propagates unchanged.
        """
        super().__init__(
            args_type=GlobalSearchToolArgs,
            return_type=GlobalSearchToolReturn,
            name="global_search_tool",
            description="Perform a global search with given parameters using graphrag.",
        )
        # Use the provided LLM
        self._llm = llm

        # Load parquet files (same order as before: community, entity, report,
        # entity embedding).
        input_dir = data_config.input_dir
        community_df = self._read_parquet_table(input_dir, data_config.community_table)
        entity_df = self._read_parquet_table(input_dir, data_config.entity_table)
        report_df = self._read_parquet_table(input_dir, data_config.community_report_table)
        entity_embedding_df = self._read_parquet_table(input_dir, data_config.entity_embedding_table)

        communities = read_indexer_communities(community_df, entity_df, report_df)
        reports = read_indexer_reports(report_df, entity_df, data_config.community_level)
        entities = read_indexer_entities(entity_df, entity_embedding_df, data_config.community_level)

        context_builder = GlobalCommunityContext(
            community_reports=reports,
            communities=communities,
            entities=entities,
            token_encoder=token_encoder,
        )

        context_builder_params = {
            "use_community_summary": context_config.use_community_summary,
            "shuffle_data": context_config.shuffle_data,
            "include_community_rank": context_config.include_community_rank,
            "min_community_rank": context_config.min_community_rank,
            "community_rank_name": context_config.community_rank_name,
            "include_community_weight": context_config.include_community_weight,
            "community_weight_name": context_config.community_weight_name,
            "normalize_community_weight": context_config.normalize_community_weight,
            "max_tokens": context_config.max_data_tokens,
            "context_name": "Reports",
        }

        # The map step always requests a JSON object from the LLM, independent
        # of mapreduce_config.json_mode.
        map_llm_params = {
            "max_tokens": mapreduce_config.map_max_tokens,
            "temperature": mapreduce_config.map_temperature,
            "response_format": {"type": "json_object"},
        }

        reduce_llm_params = {
            "max_tokens": mapreduce_config.reduce_max_tokens,
            "temperature": mapreduce_config.reduce_temperature,
        }

        self._search_engine = GlobalSearch(
            llm=self._llm,
            context_builder=context_builder,
            token_encoder=token_encoder,
            max_data_tokens=context_config.max_data_tokens,
            map_llm_params=map_llm_params,
            reduce_llm_params=reduce_llm_params,
            allow_general_knowledge=mapreduce_config.allow_general_knowledge,
            json_mode=mapreduce_config.json_mode,
            context_builder_params=context_builder_params,
            concurrent_coroutines=32,
            response_type=mapreduce_config.response_type,
        )

    @staticmethod
    def _read_parquet_table(input_dir: str, table_name: str) -> pd.DataFrame:
        """Load ``<input_dir>/<table_name>.parquet`` into a DataFrame."""
        return pd.read_parquet(f"{input_dir}/{table_name}.parquet")  # type: ignore

    async def run(self, args: GlobalSearchToolArgs, cancellation_token: CancellationToken) -> GlobalSearchToolReturn:
        """Run a global search for ``args.query`` and return the answer text.

        Args:
            args: Validated tool arguments carrying the query.
            cancellation_token: Accepted for interface compatibility.
                NOTE(review): it is not forwarded to the search engine, so an
                in-flight search is not interrupted through this token.

        Returns:
            The search engine's response wrapped in ``GlobalSearchToolReturn``.

        Raises:
            TypeError: If the search engine returns a non-string response.
        """
        result = await self._search_engine.asearch(args.query)
        response = result.response
        # Explicit check instead of `assert`: assertions are removed under
        # `python -O`, which would let a non-string response slip through.
        if not isinstance(response, str):
            raise TypeError(f"Expected response to be a string, got {type(response).__name__}")
        return GlobalSearchToolReturn(answer=response)

    @classmethod
    def from_config(
        cls,
        openai_client: AzureOpenAIChatCompletionClient,
        data_config: DataConfig,
        context_config: ContextConfig = _default_context_config,
        mapreduce_config: MapReduceConfig = _default_mapreduce_config,
    ) -> "GlobalSearchTool":
        """Create a GlobalSearchTool instance from configuration.

        The client is wrapped in a ``GraphragOpenAiModelAdapter`` and the token
        encoder is looked up from the adapter's ``model_name``.

        Args:
            openai_client: The Azure OpenAI client to use
            data_config: Configuration for data sources
            context_config: Configuration for context building
            mapreduce_config: Configuration for map-reduce operations

        Returns:
            An initialized GlobalSearchTool instance

        Raises:
            KeyError: NOTE(review) — ``tiktoken.encoding_for_model`` raises this
                for model names it cannot map to an encoding; confirm that
                ``model_name`` is always a name tiktoken recognises.
        """
        llm_adapter = GraphragOpenAiModelAdapter(openai_client)
        token_encoder = tiktoken.encoding_for_model(llm_adapter.model_name)

        return cls(
            token_encoder=token_encoder,
            llm=llm_adapter,
            data_config=data_config,
            context_config=context_config,
            mapreduce_config=mapreduce_config,
        )
Loading