-
Notifications
You must be signed in to change notification settings - Fork 5.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add DataToDataFrame component for converting Data objects #6112
base: main
Are you sure you want to change the base?
Conversation
… into a DataFrame for easier data manipulation and analysis.
…me method to explain the process of building a DataFrame from Data objects
…on to ensure proper syntax and avoid potential errors
# If user passed a single Data, it might come in as a single object rather than a list | ||
if not isinstance(data_input, list): | ||
data_input = [data_input] | ||
|
||
rows = [] | ||
for item in data_input: | ||
if not isinstance(item, Data): | ||
msg = f"Expected Data objects, got {type(item)} instead." | ||
raise TypeError(msg) | ||
|
||
# Start with a copy of item.data or an empty dict | ||
row_dict = dict(item.data) if item.data else {} | ||
|
||
# If the Data object has text, store it under 'text' col | ||
text_val = item.get_text() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
# If user passed a single Data, it might come in as a single object rather than a list | |
if not isinstance(data_input, list): | |
data_input = [data_input] | |
rows = [] | |
for item in data_input: | |
if not isinstance(item, Data): | |
msg = f"Expected Data objects, got {type(item)} instead." | |
raise TypeError(msg) | |
# Start with a copy of item.data or an empty dict | |
row_dict = dict(item.data) if item.data else {} | |
# If the Data object has text, store it under 'text' col | |
text_val = item.get_text() | |
# Ensure data_input is a list | |
# Use list comprehension to create rows more efficiently | |
rows = [ | |
{ | |
**(item.data if item.data else {}), | |
**({"text": item.get_text()} if item.get_text() else {}), | |
} | |
for item in data_input | |
if isinstance(item, Data) | |
] | |
# Verify all items are Data objects and raise TypeError if not | |
if len(rows) != len(data_input): | |
raise TypeError("All input items must be Data objects.") |
⚡️ Codeflash found optimizations for this PR📄 16% (0.16x) speedup for
|
Test | Status |
---|---|
⚙️ Existing Unit Tests | 🔘 None Found |
🌀 Generated Regression Tests | ✅ 4 Passed |
⏪ Replay Tests | 🔘 None Found |
🔎 Concolic Coverage Tests | 🔘 None Found |
📊 Tests Coverage | undefined |
🌀 Generated Regression Tests Details
from typing import Any, Dict, List
import pandas as pd # used for DataFrame operations
# imports
import pytest # used for our unit tests
from langflow.components.processing.data_to_dataframe import \
DataToDataFrameComponent
from langflow.custom import Component
from langflow.schema import Data, DataFrame
# function to test
class DataFrame(pd.DataFrame):
    """pandas DataFrame subclass that can be built from Data objects or plain dicts."""

    def __init__(self, data: List[Dict] | List[Data] | pd.DataFrame | None = None, **kwargs):
        """Build the frame from ``data``.

        Args:
            data: A list made up entirely of Data objects (their ``.data``
                dicts become the rows), a list made up entirely of dicts, a
                dict, a pandas DataFrame, or None.
            **kwargs: Forwarded unchanged to ``pd.DataFrame.__init__``.

        Raises:
            ValueError: If ``data`` is a list that is neither all Data
                objects nor all dicts.
        """
        if isinstance(data, list):
            if all(isinstance(entry, Data) for entry in data):
                # Unwrap each Data object to the dict it carries.
                data = [entry.data for entry in data if hasattr(entry, "data")]
            elif any(not isinstance(entry, dict) for entry in data):
                msg = "List items must be either all Data objects or all dictionaries"
                raise ValueError(msg)
            kwargs["data"] = data
        elif isinstance(data, (dict, pd.DataFrame)):
            kwargs["data"] = data
        # None, or any other type, is not forwarded: the frame is then built
        # from kwargs alone.
        super().__init__(**kwargs)

    def to_data_list(self) -> List[Data]:
        """Return one Data object per row, built from the row's record dict."""
        return [Data(data=record) for record in self.to_dict(orient="records")]

    def add_row(self, data: Dict | Data) -> "DataFrame":
        """Return the concatenation of this frame and one extra row (index reset)."""
        payload = data.data if isinstance(data, Data) else data
        extra = self._constructor([payload])
        return pd.concat([self, extra], ignore_index=True)

    def add_rows(self, data: List[Dict | Data]) -> "DataFrame":
        """Return the concatenation of this frame and several extra rows (index reset)."""
        payloads = [item.data if isinstance(item, Data) else item for item in data]
        extra = self._constructor(payloads)
        return pd.concat([self, extra], ignore_index=True)

    @property
    def _constructor(self):
        """Factory that builds this subclass and finalizes the result against ``self``."""

        def _build(*args, **kwargs):
            return DataFrame(*args, **kwargs).__finalize__(self)

        return _build

    def __bool__(self):
        """A frame is truthy exactly when it is not empty."""
        is_empty = self.empty
        return not is_empty
class Data:
    """Minimal record holder: a dict payload plus the key used to look up its text."""

    def __init__(self, data: Dict[str, Any] = None):
        """Store ``data`` (or a fresh empty dict when it is falsy) and the text defaults."""
        self.text_key = "text"
        self.default_value = ""
        self.data = data if data else {}

    def get_text(self):
        """Return the payload value stored under ``text_key``, or ``default_value`` if absent."""
        key, fallback = self.text_key, self.default_value
        return self.data.get(key, fallback)
from langflow.components.processing.data_to_dataframe import \
DataToDataFrameComponent
# unit tests
# Basic Functionality Tests
def test_empty_input():
    """Call build_dataframe with an empty data_list; no assertion is made on the result."""
    converter = DataToDataFrameComponent()
    converter.data_list = []
    codeflash_output = converter.build_dataframe()
def test_non_data_objects_in_input():
    """A data_list mixing a Data object with a plain dict must raise TypeError."""
    converter = DataToDataFrameComponent()
    mixed_items = [Data(data={"name": "John"}), {"name": "Jane"}]
    converter.data_list = mixed_items
    with pytest.raises(TypeError):
        converter.build_dataframe()
def test_invalid_input_types():
    """A bare string assigned to data_list must raise TypeError."""
    converter = DataToDataFrameComponent()
    converter.data_list = "string"
    with pytest.raises(TypeError):
        converter.build_dataframe()
def test_invalid_data_structures():
    """A single Data object constructed with a string payload must raise TypeError."""
    converter = DataToDataFrameComponent()
    malformed = Data(data="invalid_structure")
    converter.data_list = malformed
    with pytest.raises(TypeError):
        converter.build_dataframe()
# Performance and Scalability Tests
from typing import Any, Dict, List
import pandas as pd
# imports
import pytest # used for our unit tests
from langflow.components.processing.data_to_dataframe import \
DataToDataFrameComponent
from pydantic import BaseModel
# function to test
class DataFrame(pd.DataFrame):
"""A pandas DataFrame subclass specialized for handling collections of Data objects."""
def __init__(self, data: List[Dict] | List["Data"] | pd.DataFrame | None = None, **kwargs):
if data is None:
super().__init__(**kwargs)
return
if isinstance(data, list):
if all(isinstance(x, Data) for x in data):
data = [d.data for d in data if hasattr(d, "data")]
elif not all(isinstance(x, dict) for x in data):
raise ValueError("List items must be either all Data objects or all dictionaries")
kwargs["data"] = data
elif isinstance(data, (dict, pd.DataFrame)):
kwargs["data"] = data
super().__init__(**kwargs)
def to_data_list(self) -> List["Data"]:
"""Converts the DataFrame back to a list of Data objects."""
list_of_dicts = self.to_dict(orient="records")
return [Data(data=row) for row in list_of_dicts]
def add_row(self, data: Dict | "Data") -> "DataFrame":
"""Adds a single row to the dataset."""
if isinstance(data, Data):
data = data.data
new_df = self._constructor([data])
return pd.concat([self, new_df], ignore_index=True)
def add_rows(self, data: List[Dict | "Data"]) -> "DataFrame":
"""Adds multiple rows to the dataset."""
processed_data = []
for item in data:
if isinstance(item, Data):
processed_data.append(item.data)
else:
processed_data.append(item)
new_df = self._constructor(processed_data)
return pd.concat([self, new_df], ignore_index=True)
@property
def _constructor(self):
def _c(*args, **kwargs):
return DataFrame(*args, **kwargs).__finalize__(self)
return _c
def __bool__(self):
"""Truth value testing for the DataFrame."""
return not self.empty
class Data(BaseModel):
    """A record made of a data dictionary and the key under which its text lives."""

    # Key in ``data`` that holds the record's text.
    text_key: str = "text"
    # Payload of the record.
    data: Dict[str, Any] = {}
    # Value returned by get_text when ``text_key`` is absent from ``data``.
    default_value: str | None = ""

    def get_text(self):
        """Return the value stored under ``text_key``, or ``default_value`` when missing."""
        key, fallback = self.text_key, self.default_value
        return self.data.get(key, fallback)

    def set_text(self, text: str | None) -> str:
        """Store ``text`` (as a string; None becomes "") under ``text_key`` and return it."""
        normalized = str(text) if text is not None else ""
        self.data[self.text_key] = normalized
        return normalized
from langflow.components.processing.data_to_dataframe import \
DataToDataFrameComponent
# unit tests
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hey @Cristhianzl
All Component PRs need tests. Could you, please, add them?
…onent to ensure proper construction of DataFrame from Data objects with various fields and configurations.
…use pandas module instead of turtle for DataFrame operations ♻️ (test_data_to_dataframe.py): Refactor test_data_to_dataframe.py to improve readability and consistency in DataFrame testing assertions
…dability and consistency in test cases
Done! Thanks! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
This pull request introduces a new component to the `langflow` library that converts data objects into a DataFrame. The main addition is the `DataToDataFrameComponent` class, which includes methods for transforming data and handling inputs and outputs.

Key changes include:

- `DataToDataFrameComponent` class in `src/backend/base/langflow/components/processing/data_to_dataframe.py` to convert one or multiple `Data` objects into a `DataFrame`. This class includes a detailed description, icon, name, inputs, outputs, and the `build_dataframe` method for processing data.

The new component enhances the library's data processing capabilities by allowing easy conversion of structured data into a DataFrame format.