Skip to content

Commit

Permalink
feature: Make SentenceSplitter's secondary_chunking_regex optional (r…
Browse files (browse the repository at this point in the history)
  • Loading branch information
carolinebinley authored and raspawar committed Oct 7, 2024
1 parent d319e2c commit bf291ca
Showing 1 changed file with 13 additions and 7 deletions.
20 changes: 13 additions & 7 deletions llama-index-core/llama_index/core/node_parser/text/sentence.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -55,7 +55,7 @@ class SentenceSplitter(MetadataAwareTextSplitter):
paragraph_separator: str = Field(
default=DEFAULT_PARAGRAPH_SEP, description="Separator between paragraphs."
)
- secondary_chunking_regex: str = Field(
+ secondary_chunking_regex: Optional[str] = Field(
default=CHUNKING_REGEX, description="Backup regex for splitting into sentences."
)

Expand All @@ -72,7 +72,7 @@ def __init__(
tokenizer: Optional[Callable] = None,
paragraph_separator: str = DEFAULT_PARAGRAPH_SEP,
chunking_tokenizer_fn: Optional[Callable[[str], List[str]]] = None,
- secondary_chunking_regex: str = CHUNKING_REGEX,
+ secondary_chunking_regex: Optional[str] = CHUNKING_REGEX,
callback_manager: Optional[CallbackManager] = None,
include_metadata: bool = True,
include_prev_next_rel: bool = True,
Expand Down Expand Up @@ -107,11 +107,17 @@ def __init__(
self._chunking_tokenizer_fn,
]

- self._sub_sentence_split_fns = [
- split_by_regex(secondary_chunking_regex),
- split_by_sep(separator),
- split_by_char(),
- ]
+ if secondary_chunking_regex:
+ self._sub_sentence_split_fns = [
+ split_by_regex(secondary_chunking_regex),
+ split_by_sep(separator),
+ split_by_char(),
+ ]
+ else:
+ self._sub_sentence_split_fns = [
+ split_by_sep(separator),
+ split_by_char(),
+ ]

@classmethod
def from_defaults(
Expand Down

0 comments on commit bf291ca

Please sign in to comment.