strategy
https://github.com/microsoft/graphrag/blob/1e10bd342e858319d18fc3be39b1bafd41ab2127/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/run_graph_intelligence.py#L3
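run_extract_entities is the LLM-backed ("graph intelligence") extraction strategy: it reads its knobs from a config mapping, optionally re-chunks the input documents by token count, runs a GraphExtractor over the chunks, rewrites the chunk indices recorded on every node and edge back into document ids, and returns the entities together with the graph serialized as GraphML. (Module imports are omitted from the excerpt; see the linked file.)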
code:strategy.py
 async def run_extract_entities(
     llm: CompletionLLM,
     docs: list[Document],
     entity_types: EntityTypes,
     reporter: VerbCallbacks | None,
     args: StrategyConfig,
 ) -> EntityExtractionResult:
     """Run the entity extraction chain."""
     encoding_name = args.get("encoding_name", "cl100k_base")
     # Chunking Arguments
     prechunked = args.get("prechunked", False)
     chunk_size = args.get("chunk_size", defs.CHUNK_SIZE)
     chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)
     # Extraction Arguments
     tuple_delimiter = args.get("tuple_delimiter", None)
     record_delimiter = args.get("record_delimiter", None)
     completion_delimiter = args.get("completion_delimiter", None)
     extraction_prompt = args.get("extraction_prompt", None)
     encoding_model = args.get("encoding_name", None)
     max_gleanings = args.get("max_gleanings", defs.ENTITY_EXTRACTION_MAX_GLEANINGS)
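Every knob comes out of a plain args mapping via dict.get with a default, so a strategy config only has to name the keys it overrides. One quirk worth noticing: encoding_model reads the same "encoding_name" key as encoding_name above, just without the "cl100k_base" fallback.
code:strategy.py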
     # note: We're not using UnipartiteGraphChain.from_params
     # because we want to pass "timeout" to the llm_kwargs
     text_splitter = _create_text_splitter(
         prechunked, chunk_size, chunk_overlap, encoding_name
     )
     extractor = GraphExtractor(
         llm_invoker=llm,
         prompt=extraction_prompt,
         encoding_model=encoding_model,
         max_gleanings=max_gleanings,
         on_error=lambda e, s, d: (
             reporter.error("Entity Extraction Error", e, s, d) if reporter else None
         ),
     )
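GraphExtractor wraps the actual LLM calls. max_gleanings bounds how many follow-up "did you miss anything" passes the extractor may run after the first extraction, and on_error forwards failures to the reporter when one is attached (judging by the call, the lambda's three arguments are the error, stack, and details).
code:strategy.py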
     text_list = [doc.text.strip() for doc in docs]
     # If it's not pre-chunked, then re-chunk the input
     if not prechunked:
         text_list = text_splitter.split_text("\n".join(text_list))
     results = await extractor(
         list(text_list),
         {
             "entity_types": entity_types,
             "tuple_delimiter": tuple_delimiter,
             "record_delimiter": record_delimiter,
             "completion_delimiter": completion_delimiter,
         },
     )
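The second argument is the set of variables substituted into the extraction prompt. The delimiters tell the model how to format its output; when they are left as None, GraphExtractor falls back to graphrag's defaults (at the time of this commit, roughly "<|>" between tuple fields, "##" between records, and "<|COMPLETE|>" to finish).
code:strategy.py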
     graph = results.output
     # Map the "source_id" back to the "id" field
     for _, node in graph.nodes(data=True):  # type: ignore
         if node is not None:
             node["source_id"] = ",".join(
                 docs[int(id)].id for id in node["source_id"].split(",")
             )
     for _, _, edge in graph.edges(data=True):  # type: ignore
         if edge is not None:
             edge["source_id"] = ",".join(
                 docs[int(id)].id for id in edge["source_id"].split(",")
             )
     entities = [
         ({"name": item[0], **(item[1] or {})})
         for item in graph.nodes(data=True)
         if item is not None
     ]
     graph_data = "".join(nx.generate_graphml(graph))
     return EntityExtractionResult(entities, graph_data)
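The extractor tags every node and edge with the indices of the chunks it was found in, comma-separated in source_id; the two loops rewrite those indices into real document ids. A minimal, runnable sketch of the same remapping pattern (the toy graph and the dict stand-ins for Document are hypothetical; only networkx is real):
code:remap_sketch.py
 import networkx as nx
 # Stand-ins for the Document records above; only the .id field matters here.
 docs = [{"id": "doc-a"}, {"id": "doc-b"}]
 g = nx.Graph()
 g.add_node("ENTITY", source_id="0,1")  # the extractor stores chunk indices
 for _, node in g.nodes(data=True):
     node["source_id"] = ",".join(
         docs[int(i)]["id"] for i in node["source_id"].split(",")
     )
 print(dict(g.nodes(data=True)))  # {'ENTITY': {'source_id': 'doc-a,doc-b'}}
The entity list then flattens each node into {"name": ..., **attributes}, and nx.generate_graphml yields the GraphML serialization line by line, which "".join collapses into one string. Finally, the splitter helper used at the top:
code:strategy.py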
 def _create_text_splitter(
     prechunked: bool, chunk_size: int, chunk_overlap: int, encoding_name: str
 ) -> TextSplitter:
     """Create a text splitter for the extraction chain.
     Args:
         - prechunked - Whether the text is already chunked
         - chunk_size - The size of each chunk
         - chunk_overlap - The overlap between chunks
         - encoding_name - The name of the encoding to use
     Returns:
         - output - A text splitter
     """
     if prechunked:
         return NoopTextSplitter()
     return TokenTextSplitter(
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
         encoding_name=encoding_name,
     )
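When the input is not pre-chunked, TokenTextSplitter cuts the joined text into windows of chunk_size tokens that overlap by chunk_overlap tokens, counted with the named tiktoken encoding. A rough, hypothetical equivalent of that behavior (a sketch, not graphrag's actual implementation; the 1200/100 defaults are assumptions):
code:token_split_sketch.py
 import tiktoken
 def split_text(text: str, chunk_size: int = 1200, chunk_overlap: int = 100,
                encoding_name: str = "cl100k_base") -> list[str]:
     """Split text into windows of chunk_size tokens with chunk_overlap overlap."""
     enc = tiktoken.get_encoding(encoding_name)
     tokens = enc.encode(text)
     step = chunk_size - chunk_overlap
     return [
         enc.decode(tokens[start : start + chunk_size])
         for start in range(0, len(tokens), step)
     ]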