-
Notifications
You must be signed in to change notification settings - Fork 690
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
MB-59616: Adding vector_base64 field (#2012)
- Added a new field type called vector_base64. - Acts similar to vector in most cases. - When a new document arrives in the bleve layer, during the parsing of all its fields in processProperty, if the field mapping type is vector_base64, then its value is decoded into a vector field and processed like a vector. - The standard golang base64 library is used for the decode operation. --------- Co-authored-by: Abhinav Dangeti <abhinav@couchbase.com>
- Loading branch information
1 parent
6d02ec6
commit 757705e
Showing
8 changed files
with
475 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
// Copyright (c) 2024 Couchbase, Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
//go:build vectors | ||
// +build vectors | ||
|
||
package document | ||
|
||
import ( | ||
"encoding/base64" | ||
"encoding/binary" | ||
"fmt" | ||
"math" | ||
|
||
"github.com/blevesearch/bleve/v2/size" | ||
index "github.com/blevesearch/bleve_index_api" | ||
) | ||
|
||
type VectorBase64Field struct { | ||
vectorField *VectorField | ||
base64Encoding string | ||
} | ||
|
||
func (n *VectorBase64Field) Size() int { | ||
return n.vectorField.Size() | ||
} | ||
|
||
func (n *VectorBase64Field) Name() string { | ||
return n.vectorField.Name() | ||
} | ||
|
||
func (n *VectorBase64Field) ArrayPositions() []uint64 { | ||
return n.vectorField.ArrayPositions() | ||
} | ||
|
||
func (n *VectorBase64Field) Options() index.FieldIndexingOptions { | ||
return n.vectorField.Options() | ||
} | ||
|
||
func (n *VectorBase64Field) NumPlainTextBytes() uint64 { | ||
return n.vectorField.NumPlainTextBytes() | ||
} | ||
|
||
func (n *VectorBase64Field) AnalyzedLength() int { | ||
return n.vectorField.AnalyzedLength() | ||
} | ||
|
||
func (n *VectorBase64Field) EncodedFieldType() byte { | ||
return 'e' | ||
} | ||
|
||
func (n *VectorBase64Field) AnalyzedTokenFrequencies() index.TokenFrequencies { | ||
return n.vectorField.AnalyzedTokenFrequencies() | ||
} | ||
|
||
func (n *VectorBase64Field) Analyze() { | ||
} | ||
|
||
func (n *VectorBase64Field) Value() []byte { | ||
return n.vectorField.Value() | ||
} | ||
|
||
func (n *VectorBase64Field) GoString() string { | ||
return fmt.Sprintf("&document.vectorFieldBase64Field{Name:%s, Options: %s, "+ | ||
"Value: %+v}", n.vectorField.Name(), n.vectorField.Options(), n.vectorField.Value()) | ||
} | ||
|
||
// For the sake of not polluting the API, we are keeping arrayPositions as a | ||
// parameter, but it is not used. | ||
func NewVectorBase64Field(name string, arrayPositions []uint64, vectorBase64 string, | ||
dims int, similarity, vectorIndexOptimizedFor string) (*VectorBase64Field, error) { | ||
|
||
vector, err := DecodeVector(vectorBase64) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
return &VectorBase64Field{ | ||
vectorField: NewVectorFieldWithIndexingOptions(name, arrayPositions, | ||
vector, dims, similarity, | ||
vectorIndexOptimizedFor, DefaultVectorIndexingOptions), | ||
|
||
base64Encoding: vectorBase64, | ||
}, nil | ||
} | ||
|
||
// This function takes a base64 encoded string and decodes it into | ||
// a vector. | ||
func DecodeVector(encodedValue string) ([]float32, error) { | ||
|
||
// We first decode the encoded string into a byte array. | ||
decodedString, err := base64.StdEncoding.DecodeString(encodedValue) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
// The array is expected to be divisible by 4 because each float32 | ||
// should occupy 4 bytes | ||
if len(decodedString)%size.SizeOfFloat32 != 0 { | ||
return nil, fmt.Errorf("Decoded byte array not divisible by %d", size.SizeOfFloat32) | ||
} | ||
dims := int(len(decodedString) / size.SizeOfFloat32) | ||
decodedVector := make([]float32, dims) | ||
|
||
// We iterate through the array 4 bytes at a time and convert each of | ||
// them to a float32 value by reading them in a little endian notation | ||
for i := 0; i < dims; i++ { | ||
bytes := decodedString[i*size.SizeOfFloat32 : (i+1)*size.SizeOfFloat32] | ||
decodedVector[i] = math.Float32frombits(binary.LittleEndian.Uint32(bytes)) | ||
} | ||
|
||
return decodedVector, nil | ||
} | ||
|
||
func (n *VectorBase64Field) Vector() []float32 { | ||
return n.vectorField.Vector() | ||
} | ||
|
||
func (n *VectorBase64Field) Dims() int { | ||
return n.vectorField.Dims() | ||
} | ||
|
||
func (n *VectorBase64Field) Similarity() string { | ||
return n.vectorField.Similarity() | ||
} | ||
|
||
func (n *VectorBase64Field) IndexOptimizedFor() string { | ||
return n.vectorField.IndexOptimizedFor() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
// Copyright (c) 2024 Couchbase, Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
//go:build vectors | ||
// +build vectors | ||
|
||
package document | ||
|
||
import ( | ||
"bytes" | ||
"encoding/base64" | ||
"encoding/binary" | ||
"fmt" | ||
"math/rand" | ||
"testing" | ||
) | ||
|
||
func TestDecodeVector(t *testing.T) { | ||
vec := make([]float32, 2048) | ||
for i := range vec { | ||
vec[i] = rand.Float32() | ||
} | ||
|
||
vecBytes := bytifyVec(vec) | ||
encodedVec := base64.StdEncoding.EncodeToString(vecBytes) | ||
|
||
decodedVec, err := DecodeVector(encodedVec) | ||
if err != nil { | ||
t.Error(err) | ||
} | ||
if len(decodedVec) != len(vec) { | ||
t.Errorf("Decoded vector dimensions not same as original vector dimensions") | ||
} | ||
|
||
for i := range vec { | ||
if vec[i] != decodedVec[i] { | ||
t.Errorf("Decoded vector not the same as original vector") | ||
} | ||
} | ||
} | ||
|
||
func BenchmarkDecodeVector128(b *testing.B) { | ||
vec := make([]float32, 128) | ||
for i := range vec { | ||
vec[i] = rand.Float32() | ||
} | ||
|
||
vecBytes := bytifyVec(vec) | ||
encodedVec := base64.StdEncoding.EncodeToString(vecBytes) | ||
|
||
b.ResetTimer() | ||
|
||
for i := 0; i < b.N; i++ { | ||
_, _ = DecodeVector(encodedVec) | ||
} | ||
} | ||
|
||
func BenchmarkDecodeVector784(b *testing.B) { | ||
vec := make([]float32, 784) | ||
for i := range vec { | ||
vec[i] = rand.Float32() | ||
} | ||
|
||
vecBytes := bytifyVec(vec) | ||
encodedVec := base64.StdEncoding.EncodeToString(vecBytes) | ||
|
||
b.ResetTimer() | ||
|
||
for i := 0; i < b.N; i++ { | ||
_, _ = DecodeVector(encodedVec) | ||
} | ||
} | ||
|
||
func BenchmarkDecodeVector1536(b *testing.B) { | ||
vec := make([]float32, 1536) | ||
for i := range vec { | ||
vec[i] = rand.Float32() | ||
} | ||
|
||
vecBytes := bytifyVec(vec) | ||
encodedVec := base64.StdEncoding.EncodeToString(vecBytes) | ||
|
||
b.ResetTimer() | ||
|
||
for i := 0; i < b.N; i++ { | ||
_, _ = DecodeVector(encodedVec) | ||
} | ||
} | ||
|
||
// bytifyVec serializes vec into a byte slice by writing each element in
// order as a little-endian float32. A write error is printed to standard
// output and the remaining elements are still processed.
func bytifyVec(vec []float32) []byte {
	var out bytes.Buffer

	for i := range vec {
		if err := binary.Write(&out, binary.LittleEndian, vec[i]); err != nil {
			fmt.Println(err)
		}
	}

	return out.Bytes()
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.