This post describes how to define and deploy a Vespa image search app through the Vespa python API. In addition, we create a Vespa feeder based on the PyTorch Dataset/DataLoader framework.

Check the following post for context about the data and the model used here:

Vespa image search app

Check the pyvespa documentation for more info about the Vespa python API.

Create application package

from vespa.package import ApplicationPackage, Field, HNSW, RankProfile, QueryTypeField

app_package = ApplicationPackage(name="image_search")

app_package.schema.add_fields(
    Field(
        name="image_file_name", 
        type="string", 
        indexing=["summary", "attribute"]
    ),
    Field(
        name="vit_b_32_image", 
        type="tensor<float>(x[512])", 
        indexing=["attribute", "index"], 
        ann=HNSW(
            distance_metric="euclidean", 
            max_links_per_node=16, 
            neighbors_to_explore_at_insert=500
        )
    )    
)
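
For reference, the field definition above generates approximately the following content in the Vespa schema for the vit_b_32_image field (the exact output may vary with the pyvespa version):

field vit_b_32_image type tensor<float>(x[512]) {
    indexing: attribute | index
    attribute {
        distance-metric: euclidean
    }
    index {
        hnsw {
            max-links-per-node: 16
            neighbors-to-explore-at-insert: 500
        }
    }
}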

app_package.schema.add_rank_profile(
    RankProfile(
        name="vit-b-32-similarity", 
        inherits="default", 
        first_phase="closeness(vit_b_32_image)"
    )
)

app_package.query_profile_type.add_fields(
    QueryTypeField(
        name="ranking.features.query(vit_b_32_text)", 
        type="tensor<float>(x[512])"
    )
)
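
Although querying is not the focus of this post, the sketch below shows how this query field is used: at query time a text prompt is encoded with CLIP's text encoder and sent as the vit_b_32_text ranking feature of a nearestNeighbor query. This is a sketch, not code from the original app: the prompt is illustrative, app is the connection created in the deployment step below, and the YQL annotation syntax may differ across Vespa versions.

import torch
import clip

model, _ = clip.load("ViT-B/32")

def text_to_vector(text):
    # Encode and normalize the text query, mirroring the image encoding used at feed time
    with torch.no_grad():
        features = model.encode_text(clip.tokenize([text])).float()
        features /= features.norm(dim=-1, keepdim=True)
    return features.tolist()[0]

result = app.query(body={
    "yql": 'select * from sources * where ([{"targetNumHits": 100}]nearestNeighbor(vit_b_32_image, vit_b_32_text));',
    "hits": 10,
    "ranking.features.query(vit_b_32_text)": text_to_vector("a dog playing in the snow"),
    "ranking.profile": "vit-b-32-similarity"
})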

Deploy application

import os

from vespa.deployment import VespaDocker

vespa_docker = VespaDocker(disk_folder=os.environ["DISK_FOLDER"])
app = vespa_docker.deploy(application_package=app_package)
Waiting for configuration server.
Waiting for configuration server.
Waiting for configuration server.
Waiting for configuration server.
Waiting for configuration server.
Waiting for configuration server.
Waiting for application status.
Waiting for application status.
Waiting for application status.
Finished deployment.
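
Optionally, we can check that the application is up before feeding. The helper below reflects the pyvespa version used here; whether your version exposes the same method name is an assumption worth verifying:

status = app.get_application_status()  # assumed pyvespa helper; queries the /ApplicationStatus endpoint
print(status)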

Feeding

Create PyTorch ImageFeedDataset

Create a custom Dataset that loads an image, transforms it into a 512-dimensional vector, and returns the data in a pyvespa-compatible format.

import os
import glob
import ntpath
import torch
from torch.utils.data import Dataset
from PIL import Image
import clip


class ImageFeedDataset(Dataset):
    def __init__(self, img_dir, image_embedding_name, model_name):
        # Load the CLIP model and its companion image pre-processing pipeline
        self.model, self.preprocess = clip.load(model_name)
        self.img_dir = img_dir
        self.image_file_names = glob.glob(os.path.join(img_dir, "*.jpg"))
        self.image_embedding_name = image_embedding_name

    def _from_image_to_vector(self, x):
        # Encode the image into a 512-dimensional vector and normalize it to unit length
        with torch.no_grad():
            image_features = self.model.encode_image(self.preprocess(x).unsqueeze(0)).float()
            image_features /= image_features.norm(dim=-1, keepdim=True)
        return image_features

    def __len__(self):
        return len(self.image_file_names)

    def __getitem__(self, idx):
        image_file_name = self.image_file_names[idx]
        image = Image.open(image_file_name)
        image = self._from_image_to_vector(image)
        image_base_name = ntpath.basename(image_file_name)
        # Return the data point in the format expected by pyvespa feed methods
        return {
            "id": image_base_name.split(".jpg")[0],
            "fields": {
                "image_file_name": image_base_name,
                self.image_embedding_name: {"values": image.tolist()[0]}
            }
        }

Instantiate Dataset and DataLoader

image_dataset = ImageFeedDataset(
    img_dir=os.environ["IMG_DIR"],  # Folder containing image files     
    image_embedding_name="vit_b_32_image",  # name of the Vespa field that will hold image embedding
    model_name="ViT-B/32" # CLIP model name used to convert image into vector
)
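
As a sanity check, we can inspect a single data point (index 0 is arbitrary) and confirm that it matches the schema defined earlier:

sample = image_dataset[0]
print(sample["id"])                                        # document id derived from the file name
print(sample["fields"]["image_file_name"])                 # stored file name
print(len(sample["fields"]["vit_b_32_image"]["values"]))   # 512, the CLIP ViT-B/32 embedding size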

The DataLoader makes it possible to loop through the dataset batch_size data points at a time. Since the objective is to feed the data to the Vespa app, we can set shuffle to False. We also specify a custom collate_fn so that the pyvespa-compatible format returned by the Dataset is preserved when batching; the default collate function would otherwise try to merge the dictionaries into batched tensors.

from torch.utils.data import DataLoader

dataloader = DataLoader(image_dataset, batch_size=128, shuffle=False, collate_fn=lambda x: x)
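
With the identity collate_fn, each batch is simply a list of the dictionaries produced by the Dataset, which is exactly what pyvespa expects. A quick way to verify (note that this encodes one full batch of images, so it takes a moment):

batch = next(iter(dataloader))
print(type(batch), len(batch))  # <class 'list'> 128
print(batch[0]["id"])           # id of the first data point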

Feed the data

Note that most of the feeding time is spent creating the image embeddings, so pre-computing the embeddings provides a significant speed-up (see the sketch after the feed loop below).

for idx, batch in enumerate(dataloader):
    print("Iteration: {}/{}".format(idx, len(dataloader)))
    app.feed_batch(batch=batch)
Iteration: 0/64
Iteration: 1/64
Iteration: 2/64
...
Iteration: 63/64
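
As a sketch of the pre-computation idea mentioned above (the file name and JSON Lines format are illustrative assumptions, not part of the original app): the embeddings can be computed once, persisted to disk, and fed from there, so that subsequent feeds skip the expensive CLIP encoding step.

import json

# One-off pass: compute and persist all embeddings (the slow part happens only once)
with open("image_embeddings.jsonl", "w") as f:  # hypothetical file name
    for batch in dataloader:
        for data_point in batch:
            f.write(json.dumps(data_point) + "\n")

# Later feeds read the pre-computed vectors instead of re-encoding the images
with open("image_embeddings.jsonl") as f:
    precomputed = [json.loads(line) for line in f]

for i in range(0, len(precomputed), 128):
    app.feed_batch(batch=precomputed[i:i + 128])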