feature: Make SentenceSplitter's secondary_chunking_regex optional (run-llama#15882)
raspawar · Oct 7, 2024 · bf291ca · bf291ca
1 parent d319e2c
commit bf291ca
Showing 1 changed file with 13 additions and 7 deletions.
diff --git a/llama-index-core/llama_index/core/node_parser/text/sentence.py b/llama-index-core/llama_index/core/node_parser/text/sentence.py
@@ -55,7 +55,7 @@ class SentenceSplitter(MetadataAwareTextSplitter):
     paragraph_separator: str = Field(
         default=DEFAULT_PARAGRAPH_SEP, description="Separator between paragraphs."
     )
-    secondary_chunking_regex: str = Field(
+    secondary_chunking_regex: Optional[str] = Field(
         default=CHUNKING_REGEX, description="Backup regex for splitting into sentences."
     )
 
@@ -72,7 +72,7 @@ def __init__(
         tokenizer: Optional[Callable] = None,
         paragraph_separator: str = DEFAULT_PARAGRAPH_SEP,
         chunking_tokenizer_fn: Optional[Callable[[str], List[str]]] = None,
-        secondary_chunking_regex: str = CHUNKING_REGEX,
+        secondary_chunking_regex: Optional[str] = CHUNKING_REGEX,
         callback_manager: Optional[CallbackManager] = None,
         include_metadata: bool = True,
         include_prev_next_rel: bool = True,
@@ -107,11 +107,17 @@ def __init__(
             self._chunking_tokenizer_fn,
         ]
 
-        self._sub_sentence_split_fns = [
-            split_by_regex(secondary_chunking_regex),
-            split_by_sep(separator),
-            split_by_char(),
-        ]
+        if secondary_chunking_regex:
+            self._sub_sentence_split_fns = [
+                split_by_regex(secondary_chunking_regex),
+                split_by_sep(separator),
+                split_by_char(),
+            ]
+        else:
+            self._sub_sentence_split_fns = [
+                split_by_sep(separator),
+                split_by_char(),
+            ]
 
     @classmethod
     def from_defaults(