-
Notifications
You must be signed in to change notification settings - Fork 5.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add support to accept Dataframe as input to split text, and added relevant tests #6302
base: main
Are you sure you want to change the base?
Conversation
if len(self.data_inputs) == 0: | ||
msg = "DataFrame is empty" | ||
raise TypeError(msg) | ||
|
||
self.data_inputs.text_key = self.text_key | ||
try: | ||
documents = self.data_inputs.to_lc_documents() | ||
except Exception as e: | ||
msg = f"Error converting DataFrame to documents: {e}" | ||
raise TypeError(msg) from e | ||
else: | ||
if not self.data_inputs: | ||
msg = "No data inputs provided" | ||
raise TypeError(msg) | ||
|
||
documents = [_input.to_lc_document() for _input in self.data_inputs if isinstance(_input, Data)] | ||
documents = [] | ||
for _input in self.data_inputs: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if len(self.data_inputs) == 0: | |
msg = "DataFrame is empty" | |
raise TypeError(msg) | |
self.data_inputs.text_key = self.text_key | |
try: | |
documents = self.data_inputs.to_lc_documents() | |
except Exception as e: | |
msg = f"Error converting DataFrame to documents: {e}" | |
raise TypeError(msg) from e | |
else: | |
if not self.data_inputs: | |
msg = "No data inputs provided" | |
raise TypeError(msg) | |
documents = [_input.to_lc_document() for _input in self.data_inputs if isinstance(_input, Data)] | |
documents = [] | |
for _input in self.data_inputs: | |
if not len(self.data_inputs): | |
raise TypeError("DataFrame is empty") | |
raise TypeError(f"Error converting DataFrame to documents: {e}") from e | |
raise TypeError("No data inputs provided") | |
try: | |
documents = [_input.to_lc_document() for _input in self.data_inputs if isinstance(_input, Data)] | |
if len(documents) != len(self.data_inputs): | |
raise TypeError("Unsupported input types present") | |
except Exception as e: | |
raise TypeError(f"Error processing inputs: {e}") from e | |
raise TypeError(f"Error splitting text: {e}") from e |
⚡️ Codeflash found optimizations for this PR📄 39% (0.39x) speedup for
|
Test | Status |
---|---|
⚙️ Existing Unit Tests | 🔘 None Found |
🌀 Generated Regression Tests | ✅ 14 Passed |
⏪ Replay Tests | 🔘 None Found |
🔎 Concolic Coverage Tests | 🔘 None Found |
📊 Tests Coverage | undefined |
🌀 Generated Regression Tests Details
from typing import Any, Dict, List
import pandas as pd
# imports
import pytest # used for our unit tests
from langchain_text_splitters import CharacterTextSplitter
from langflow.components.processing.split_text import SplitTextComponent
from langflow.custom import Component
from langflow.logging.logger import logger
from langflow.schema import Data, DataFrame
from pydantic import BaseModel
class DataFrame(pd.DataFrame):
    """A pandas DataFrame subclass specialized for handling collections of Data objects.

    Each row is one record; the column named by ``text_key`` holds the
    document text and every other column is treated as metadata.
    """

    # Declare the custom attributes so pandas propagates them onto frames
    # derived from this one (slices, copies, arithmetic results) instead of
    # silently dropping them.  Without this, ``to_lc_documents`` on a derived
    # frame would fall back to pandas defaults and fail.
    _metadata = ["_text_key", "_default_value"]

    def __init__(self, data: List[Dict[str, Any]] = None, text_key: str = "text", default_value: str = "", **kwargs):
        """Build the frame from a list of row dicts.

        Args:
            data: Row records passed straight through to ``pd.DataFrame``.
            text_key: Name of the column holding the document text.
            default_value: Fallback text used when ``text_key`` is absent in a row.
            **kwargs: Forwarded unchanged to ``pd.DataFrame``.
        """
        super().__init__(data, **kwargs)
        self._text_key = text_key
        self._default_value = default_value

    def to_lc_documents(self):
        """Convert each row into a LangChain-style document dict.

        Returns:
            A list of ``{"page_content": ..., "metadata": ...}`` dicts: the
            ``text_key`` column becomes ``page_content`` (or ``default_value``
            if missing) and all remaining columns land in ``metadata``.
        """
        records = self.to_dict(orient="records")
        documents = []
        for row in records:
            metadata = row.copy()
            text = metadata.pop(self._text_key, self._default_value)
            documents.append({"page_content": text, "metadata": metadata})
        return documents
class Data(BaseModel):
    """Lightweight record pairing a payload dict with the name of its text field."""

    text_key: str = "text"
    data: Dict[str, Any] = {}
    default_value: str = ""

    def to_lc_document(self):
        """Return a LangChain-style document dict built from this record.

        The value under ``text_key`` becomes ``page_content`` (falling back to
        ``default_value`` when absent); every other entry goes to ``metadata``.
        """
        metadata = dict(self.data)
        page_content = metadata.pop(self.text_key, self.default_value)
        return {"page_content": page_content, "metadata": metadata}
from langflow.components.processing.split_text import SplitTextComponent
# unit tests
class TestSplitTextComponent:
    """Unit tests for SplitTextComponent."""

    def create_component(self, data_inputs, separator, chunk_size, chunk_overlap, text_key="text"):
        """Return a SplitTextComponent wired up with the given configuration."""
        comp = SplitTextComponent()
        comp.separator = separator
        comp.chunk_size = chunk_size
        comp.chunk_overlap = chunk_overlap
        comp.text_key = text_key
        comp.data_inputs = data_inputs
        return comp
# Basic Functionality Tests
import pytest # used for our unit tests
from langchain_text_splitters import CharacterTextSplitter
from langflow.components.processing.split_text import SplitTextComponent
from langflow.custom import Component
from langflow.logging.logger import logger
from langflow.schema import Data, DataFrame
# unit tests
@pytest.fixture
def setup_component():
    """Provide a SplitTextComponent preconfigured for the tests below."""
    comp = SplitTextComponent()
    comp.text_key = "text"
    comp.chunk_size = 5
    comp.chunk_overlap = 0
    comp.separator = "\\n"
    return comp
def test_empty_inputs_list(setup_component):
    """An empty input list must be rejected with a TypeError."""
    setup_component.data_inputs = []
    with pytest.raises(TypeError, match="No data inputs provided"):
        setup_component.split_text()
def test_text_with_special_characters(setup_component):
    """Text containing punctuation and a newline splits without raising."""
    setup_component.data_inputs = [Data(data={"text": "Hello,\nworld!"})]
    codeflash_output = setup_component.split_text()
def test_very_large_text(setup_component):
    """A 10k-character input is handled without raising."""
    setup_component.data_inputs = [Data(data={"text": "A" * 10000})]
    codeflash_output = setup_component.split_text()
def test_invalid_data_types(setup_component):
    """A plain dict is not a valid input type and must raise a TypeError."""
    setup_component.data_inputs = [{"text": "Hello, world!"}]
    with pytest.raises(TypeError, match="Unsupported input type"):
        setup_component.split_text()
def test_different_chunk_sizes_and_overlaps(setup_component):
    """Overriding chunk size and overlap still yields a successful split."""
    setup_component.chunk_size = 4
    setup_component.chunk_overlap = 2
    setup_component.data_inputs = [Data(data={"text": "Hello, world!"})]
    codeflash_output = setup_component.split_text()
def test_different_separators(setup_component):
    """A space separator splits the text without raising."""
    setup_component.separator = " "
    setup_component.data_inputs = [Data(data={"text": "Hello world!"})]
    codeflash_output = setup_component.split_text()
def test_splitting_errors(setup_component):
    """A negative chunk size surfaces as a TypeError from split_text."""
    setup_component.chunk_size = -1  # deliberately invalid
    setup_component.data_inputs = [Data(data={"text": "Hello, world!"})]
    with pytest.raises(TypeError, match="Error splitting text"):
        setup_component.split_text()
def test_large_number_of_documents(setup_component):
    """One thousand identical inputs are processed without raising."""
    setup_component.data_inputs = [Data(data={"text": "Hello, world!"})] * 1000
    codeflash_output = setup_component.split_text()
def test_complex_nested_data(setup_component):
    """Extra nested metadata alongside the text key is tolerated."""
    setup_component.data_inputs = [Data(data={"text": "Hello, world!", "metadata": {"author": "John"}})]
    codeflash_output = setup_component.split_text()
def test_state_modification(setup_component):
    """split_text must not mutate the caller's input list.

    The original test copied the inputs but never asserted anything, so it
    could not fail; add the missing check that the inputs are preserved.
    """
    component = setup_component
    original_data = [Data(data={"text": "Hello, world!"})]
    component.data_inputs = original_data.copy()
    component.split_text()
    # Shallow copy means the list elements are the same objects; equality
    # here verifies the component neither replaced nor reordered them.
    assert component.data_inputs == original_data
@mendonk |
…angflow into split_text_updates
This pull request includes significant changes to the
SplitTextComponent
and introduces a new SplitTextComponentLegacy
class. Additionally, there are updates to the DataFrame
class and corresponding unit tests to support new functionalities. The most important changes are summarized below. New Features and Enhancements:
New Component:
SplitTextComponentLegacy
to handle deprecated text splitting functionality. (src/backend/base/langflow/components/processing/split_text_legacy.py) Enhanced SplitTextComponent:
SplitTextComponent
to support both Data
and DataFrame
input types, added a new input field text_key,
and modified output methods to handle data conversion more robustly. (src/backend/base/langflow/components/processing/split_text.py) [1] [2] Codebase Improvements:
DataFrame
class to include new attributes text_key
and default_value,
and added methods for converting to and from LangChain Document
objects. (src/backend/base/langflow/schema/dataframe.py) [1] [2] Refactoring and Bug Fixes:
split_text
method calls with as_data
in various components and tests to ensure consistency with the new method names. (src/backend/base/langflow/initial_setup/starter_projects/vector_store_rag.py,
src/backend/tests/unit/components/processing/test_split_text_component.py,
src/backend/tests/unit/initial_setup/starter_projects/test_vector_store_rag.py) [1] [2] [3] Testing:
SplitTextComponent
and DataFrame
functionalities, including new test cases for DataFrame input and URL loader. (src/backend/tests/unit/components/processing/test_split_text_component.py,
src/backend/tests/unit/schema/test_schema_dataframe.py) [1] [2]

These changes enhance the flexibility and robustness of the text processing components and ensure that the codebase is well-tested and maintainable.
https://www.loom.com/share/0bab777a383749ffb03707a7e184e97c?sid=ff2da035-c8f5-4ea3-9c36-a1f0ea71d261