Skip to content

Commit

Permalink
update: add splitter tests (#21)
Browse files Browse the repository at this point in the history
* update: add splitter tests
* fix: bugfixing sadness

Signed-off-by: Milos Gajdos <[email protected]>

---------

Signed-off-by: Milos Gajdos <[email protected]>
  • Loading branch information
milosgajdos authored Mar 23, 2024
1 parent eacb758 commit 817f998
Show file tree
Hide file tree
Showing 6 changed files with 295 additions and 61 deletions.
23 changes: 10 additions & 13 deletions document/text/character.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ import (
// or a regular expression.
type CharSplitter struct {
*Splitter
sep string
isSepRegex bool
sep Sep
}

// NewCharSplitter creates a new character splitter.
Expand All @@ -30,22 +29,20 @@ func (s *CharSplitter) WithSplitter(splitter *Splitter) *CharSplitter {
}

// WithSep sets the separator.
func (s *CharSplitter) WithSep(sep string, isSepRegex bool) *CharSplitter {
func (s *CharSplitter) WithSep(sep Sep) *CharSplitter {
s.sep = sep
s.isSepRegex = isSepRegex
return nil
return s
}

// Split splits text into chunks.
func (s *CharSplitter) Split(text string) []string {
sep := s.sep
if !s.isSepRegex {
sep = regexp.QuoteMeta(s.sep)
sep := Sep{Value: s.sep.Value, IsRegexp: s.sep.IsRegexp}
if !sep.IsRegexp {
sep.Value = regexp.QuoteMeta(sep.Value)
}
chunks := s.splitText(text, sep)
sep = ""
if !s.keepSep {
sep = s.sep
splits := s.Splitter.Split(text, sep)
if s.keepSep {
sep.Value = ""
}
return s.merge(chunks, sep)
return s.merge(splits, sep)
}
110 changes: 110 additions & 0 deletions document/text/character_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
package text

import (
"fmt"
"reflect"
"testing"
)

// TestCharSplitter runs CharSplitter.Split over a table of inputs and compares
// the returned chunks with the expected ones. The table exercises a plain
// space separator under several chunk size/overlap settings, and a "."
// separator supplied both as a literal and as a regular expression, with and
// without KeepSep.
func TestCharSplitter(t *testing.T) {
	t.Parallel()

	type testCase struct {
		chunkSize    int
		chunkOverlap int
		trimSpace    bool
		keepSep      bool
		sep          Sep
		text         string
		want         []string
	}

	tests := []testCase{
		// Space-separated input.
		{
			chunkSize:    7,
			chunkOverlap: 3,
			sep:          Sep{Value: " "},
			text:         "foo bar baz 123",
			want:         []string{"foo bar", "bar baz", "baz 123"},
		},
		{
			chunkSize:    2,
			chunkOverlap: 0,
			sep:          Sep{Value: " "},
			text:         "foo bar",
			want:         []string{"foo", "bar"},
		},
		{
			chunkSize:    3,
			chunkOverlap: 1,
			sep:          Sep{Value: " "},
			text:         "foo bar baz a a",
			want:         []string{"foo", "bar", "baz", "a a"},
		},
		{
			chunkSize:    3,
			chunkOverlap: 1,
			sep:          Sep{Value: " "},
			text:         "a a foo bar baz",
			want:         []string{"a a", "foo", "bar", "baz"},
		},
		{
			chunkSize:    1,
			chunkOverlap: 1,
			sep:          Sep{Value: " "},
			text:         "foo bar baz 123",
			want:         []string{"foo", "bar", "baz", "123"},
		},
		// Dot-separated input, separator kept: literal first, then regexp.
		{
			chunkSize:    1,
			chunkOverlap: 0,
			keepSep:      true,
			sep:          Sep{Value: ".", IsRegexp: false},
			text:         "foo.bar.baz.123",
			want:         []string{"foo", ".bar", ".baz", ".123"},
		},
		{
			chunkSize:    1,
			chunkOverlap: 0,
			keepSep:      true,
			sep:          Sep{Value: `\.`, IsRegexp: true},
			text:         "foo.bar.baz.123",
			want:         []string{"foo", ".bar", ".baz", ".123"},
		},
		// Dot-separated input, separator dropped: literal first, then regexp.
		{
			chunkSize:    1,
			chunkOverlap: 0,
			sep:          Sep{Value: ".", IsRegexp: false},
			text:         "foo.bar.baz.123",
			want:         []string{"foo", "bar", "baz", "123"},
		},
		{
			chunkSize:    1,
			chunkOverlap: 0,
			sep:          Sep{Value: `\.`, IsRegexp: true},
			text:         "foo.bar.baz.123",
			want:         []string{"foo", "bar", "baz", "123"},
		},
	}

	for _, tt := range tests {
		// Per-iteration copy: the parallel subtest closure below runs after
		// the loop has moved on.
		tt := tt

		cfg := Config{
			ChunkSize:    tt.chunkSize,
			ChunkOverlap: tt.chunkOverlap,
			TrimSpace:    tt.trimSpace,
			KeepSep:      tt.keepSep,
			LenFunc:      DefaultLenFunc,
		}
		base := NewSplitterWithConfig(cfg)
		splitter := NewCharSplitter().WithSplitter(base).WithSep(tt.sep)

		name := fmt.Sprintf("sep=%#v,size=%d,overlap=%d,trim=%v,keepSep=%v",
			tt.sep, tt.chunkSize, tt.chunkOverlap, tt.trimSpace, tt.keepSep)

		t.Run(name, func(t *testing.T) {
			t.Parallel()
			if got := splitter.Split(tt.text); !reflect.DeepEqual(got, tt.want) {
				t.Errorf("expected: %#v, got: %#v", tt.want, got)
			}
		})
	}
}
34 changes: 16 additions & 18 deletions document/text/recursive.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@ import (
// separators to find one that works.
type RecursiveCharSplitter struct {
*Splitter
seps []string
isSepRegex bool
seps []Sep
}

// NewRecursiveCharSplitter creates a new recursive character splitter and returns it.
Expand All @@ -22,53 +21,52 @@ func NewRecursiveCharSplitter() *RecursiveCharSplitter {
}
}

// WithSplitter sets the splitter
// WithSplitter sets the splitter.
func (r *RecursiveCharSplitter) WithSplitter(splitter *Splitter) *RecursiveCharSplitter {
r.Splitter = splitter
return r
}

// WithSeps sets separators
func (r *RecursiveCharSplitter) WithSeps(seps []string, isSepRegex bool) *RecursiveCharSplitter {
// WithSeps sets separators.
func (r *RecursiveCharSplitter) WithSeps(seps []Sep) *RecursiveCharSplitter {
r.seps = seps
r.isSepRegex = isSepRegex
return nil
return r
}

func (r *RecursiveCharSplitter) split(text string, seps []string) []string {
func (r *RecursiveCharSplitter) split(text string, seps []Sep) []string {
var (
resChunks []string
newSeps []string
newSeps []Sep
)

sep := seps[len(seps)-1]

for i, s := range seps {
if !r.isSepRegex {
s = regexp.QuoteMeta(s)
if !s.IsRegexp {
s.Value = regexp.QuoteMeta(s.Value)
}
if s == "" {
if s.Value == "" {
sep = s
break
}
if match, _ := regexp.MatchString(s, text); match {
if match, _ := regexp.MatchString(s.Value, text); match {
sep = s
newSeps = seps[i+1:]
break
}
}

// TODO should we escape again? Seems weird.
newSep := sep
if !r.isSepRegex {
newSep = regexp.QuoteMeta(sep)
newSep := Sep{Value: sep.Value, IsRegexp: sep.IsRegexp}
if !sep.IsRegexp {
newSep.Value = regexp.QuoteMeta(sep.Value)
}
chunks := r.splitText(text, newSep)
chunks := r.Splitter.Split(text, newSep)

var goodChunks []string

if r.keepSep {
newSep = ""
newSep.Value = ""
}

for _, chunk := range chunks {
Expand Down
76 changes: 76 additions & 0 deletions document/text/recursive_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package text

import (
"fmt"
"reflect"
"testing"
)

// TestRecursiveCharSplitter runs RecursiveCharSplitter.Split over a table of
// inputs and compares the returned chunks with the expected ones. The single
// case uses DefaultSeparators with TrimSpace and KeepSep enabled on a
// multi-paragraph text.
func TestRecursiveCharSplitter(t *testing.T) {
	t.Parallel()

	type testCase struct {
		chunkSize    int
		chunkOverlap int
		trimSpace    bool
		keepSep      bool
		seps         []Sep
		text         string
		want         []string
	}

	tests := []testCase{
		{
			chunkSize:    10,
			chunkOverlap: 1,
			trimSpace:    true,
			keepSep:      true,
			seps:         DefaultSeparators,
			// The raw-string continuation lines are deliberately left
			// unindented: any leading whitespace would become part of the
			// input text.
			text: `Hi.` + "\n\n" + `I'm Harrison.` + "\n\n" + `How? Are? You?` + "\n" + `Okay then f f f f.
This is a weird text to write, but gotta test the splittingggg some how.
Bye!` + "\n\n" + `-H.`,
			want: []string{
				"Hi.",
				"I'm",
				"Harrison.",
				"How? Are?",
				"You?",
				"Okay then",
				"f f f f.",
				"This is a",
				"weird",
				"text to",
				"write,",
				"but gotta",
				"test the",
				"splitting",
				"gggg",
				"some how.",
				"Bye!",
				"-H.",
			},
		},
	}

	for _, tt := range tests {
		// Per-iteration copy: the parallel subtest closure below runs after
		// the loop has moved on.
		tt := tt

		cfg := Config{
			ChunkSize:    tt.chunkSize,
			ChunkOverlap: tt.chunkOverlap,
			TrimSpace:    tt.trimSpace,
			KeepSep:      tt.keepSep,
			LenFunc:      DefaultLenFunc,
		}
		base := NewSplitterWithConfig(cfg)
		splitter := NewRecursiveCharSplitter().WithSplitter(base).WithSeps(tt.seps)

		name := fmt.Sprintf("sep=%#v,size=%d,overlap=%d,trim=%v,keepSep=%v",
			tt.seps, tt.chunkSize, tt.chunkOverlap, tt.trimSpace, tt.keepSep)

		t.Run(name, func(t *testing.T) {
			t.Parallel()
			if got := splitter.Split(tt.text); !reflect.DeepEqual(got, tt.want) {
				t.Errorf("expected: %#v, got: %#v", tt.want, got)
			}
		})
	}
}
Loading

0 comments on commit 817f998

Please sign in to comment.