# chunk algorithm
# code: chunk.py
"""A module containing _get_num_total, chunk, run_strategy and load_strategy methods definitions."""
from enum import Enum
from typing import Any, cast
import pandas as pd
from datashaper import (
ProgressTicker,
TableContainer,
VerbCallbacks,
VerbInput,
progress_ticker,
verb,
)
from .strategies.typing import ChunkStrategy as ChunkStrategy
from .typing import ChunkInput
def _get_num_total(output: pd.DataFrame, column: str) -> int:
num_total = 0
if isinstance(row, str):
num_total += 1
else:
num_total += len(row)
return num_total
class ChunkStrategyType(str, Enum):
    """Identifiers of the available chunking strategies.

    Members are also ``str`` instances, so a member compares equal to its
    plain string value (e.g. ``ChunkStrategyType.tokens == "tokens"``).
    """

    # Token-based chunking.
    tokens = "tokens"
    # Sentence-based chunking.
    sentence = "sentence"

    def __repr__(self) -> str:
        """Return the member's value wrapped in double quotes."""
        return '"' + self.value + '"'
@verb(name="chunk")
def chunk(
    input: VerbInput,
    column: str,
    to: str,
    callbacks: VerbCallbacks,
    strategy: dict[str, Any] | None = None,
    **_kwargs,
) -> TableContainer:
    """
    Chunk a piece of text into smaller pieces.

    The input table is modified in place: the `to` column is set to the list of
    chunks produced for each row's `column` value, and the same table is returned
    wrapped in a `TableContainer`. Progress is reported through `callbacks.progress`.

    ## Usage
    ```yaml
    verb: chunk
    args:
        column: <column name> # The name of the column containing the text to chunk, this can either be a column with text, or a column with a list[tuple[doc_id, str]]
        to: <column name> # The name of the column to output the chunks to
        strategy: <strategy config> # The strategy to use to chunk the text, see below for more details
    ```

    ## Strategies
    The text chunk verb uses a strategy to chunk the text. The strategy is an object which defines the strategy to use.
    When no strategy (or no `type`) is given, the `tokens` strategy is used. The following strategies are available:

    ### tokens
    This strategy uses the tokens library to chunk a piece of text. The strategy config is as follows:

    Note: In the future, this will likely be renamed to something more generic, like "openai_tokens".

    ```yaml
    strategy:
        type: tokens
        chunk_size: 1200 # Optional, The chunk size to use, default: 1200
        chunk_overlap: 100 # Optional, The chunk overlap to use, default: 100
    ```

    ### sentence
    This strategy uses the nltk library to chunk a piece of text into sentences. The strategy config is as follows:

    ```yaml
    strategy:
        type: sentence
    ```
    """
    if strategy is None:
        strategy = {}
    output = cast(pd.DataFrame, input.get_input())
    strategy_name = strategy.get("type", ChunkStrategyType.tokens)
    # Shallow copy: the config handed on to the strategy is a separate dict
    # from the one the caller passed in.
    strategy_config = {**strategy}
    strategy_exec = load_strategy(strategy_name)

    # The ticker needs the total number of text items up front.
    num_total = _get_num_total(output, column)
    tick = progress_ticker(callbacks.progress, num_total)

    # Row-wise apply; each row's `column` cell is chunked independently.
    output[to] = output.apply(
        cast(
            Any,
            lambda x: run_strategy(strategy_exec, x[column], strategy_config, tick),
        ),
        axis=1,
    )
    return TableContainer(table=output)
def run_strategy(
    strategy: ChunkStrategy,
    input: ChunkInput,
    strategy_args: dict[str, Any],
    tick: ProgressTicker,
) -> list[str | tuple[list[str] | None, str, int]]:
    """Run a chunking strategy over a single input value.

    Args:
        strategy: Callable invoked as ``strategy(texts, args, tick)`` with a list
            of texts; each result it yields is read for ``text_chunk``,
            ``source_doc_indices`` and ``n_tokens``.
        input: Either a single string, a list of strings, or a list of
            ``(document_id, text)`` pairs.
        strategy_args: Strategy configuration; a fresh copy is handed to the
            strategy on every call.
        tick: Progress ticker forwarded to the strategy.

    Returns:
        For string inputs, the chunk texts. For ``(document_id, text)`` inputs,
        one ``(doc_ids, text_chunk, n_tokens)`` tuple per chunk, where
        ``doc_ids`` are the ids of the documents the chunk was built from.
    """
    if isinstance(input, str):
        # The strategy receives a list of texts, so a lone string is wrapped.
        return [item.text_chunk for item in strategy([input], {**strategy_args}, tick)]

    # We can work with both just a list of text content
    # or a list of tuples of (document_id, text content)
    texts = [item if isinstance(item, str) else item[1] for item in input]

    strategy_results = strategy(texts, {**strategy_args}, tick)

    results: list[str | tuple[list[str] | None, str, int]] = []
    for strategy_result in strategy_results:
        doc_indices = strategy_result.source_doc_indices
        # The kind of the first source item decides the shape of the result.
        if isinstance(input[doc_indices[0]], str):
            results.append(strategy_result.text_chunk)
        else:
            # Map positions in `texts` back to the document ids they came from.
            doc_ids = [input[doc_idx][0] for doc_idx in doc_indices]
            results.append((
                doc_ids,
                strategy_result.text_chunk,
                strategy_result.n_tokens,
            ))
    return results
def load_strategy(strategy: ChunkStrategyType) -> ChunkStrategy:
    """Resolve a strategy identifier to the callable that implements it.

    Implementation modules are imported inside the matching branch, so only
    the selected strategy's module is imported by a given call.

    Raises:
        ValueError: If ``strategy`` matches none of the known strategy types.
    """
    if strategy == ChunkStrategyType.tokens:
        from .strategies.tokens import run as run_tokens

        return run_tokens

    if strategy == ChunkStrategyType.sentence:
        # NLTK
        # NOTE(review): bootstrap() presumably prepares the NLTK resources the
        # sentence strategy relies on — confirm against graphrag.index.bootstrap.
        from graphrag.index.bootstrap import bootstrap

        from .strategies.sentence import run as run_sentence

        bootstrap()
        return run_sentence

    msg = f"Unknown strategy: {strategy}"
    raise ValueError(msg)