Vespa text-image search with PyTorch feeder
Create, deploy and feed a text-image search app
This post describes how to define and deploy a Vespa image search app through the Vespa Python API. In addition, we create a Vespa feeder based on the PyTorch Dataset/DataLoader classes.
Check the following posts for context about the data and the model used here:
Check the pyvespa documentation for more info about the Vespa Python API.
from vespa.package import ApplicationPackage, Field, HNSW, RankProfile, QueryTypeField
# Application package for the image search app. Its schema stores, for every
# image, the file name and a 512-dimensional float embedding.
app_package = ApplicationPackage(name="image_search")

# File name of the image: included in summaries and stored as an attribute.
app_package.schema.add_fields(
    Field(name="image_file_name", type="string", indexing=["summary", "attribute"]),
)

# Image embedding: stored as an attribute and indexed with HNSW, using the
# euclidean distance between vectors.
app_package.schema.add_fields(
    Field(
        name="vit_b_32_image",
        type="tensor<float>(x[512])",
        indexing=["attribute", "index"],
        ann=HNSW(
            distance_metric="euclidean",
            max_links_per_node=16,
            neighbors_to_explore_at_insert=500,
        ),
    ),
)

# Rank profile whose first phase scores documents by their closeness on the
# image embedding field.
app_package.schema.add_rank_profile(
    RankProfile(
        name="vit-b-32-similarity",
        inherits="default",
        first_phase="closeness(vit_b_32_image)",
    ),
)

# Declare the type of the query tensor; it has the same shape as the image
# embedding field.
app_package.query_profile_type.add_fields(
    QueryTypeField(
        name="ranking.features.query(vit_b_32_text)",
        type="tensor<float>(x[512])",
    ),
)
# `os` is needed here for os.environ; import it in this cell so the code runs
# top to bottom without relying on an import that only appears further down.
import os

from vespa.deployment import VespaDocker

# Deploy the application package through VespaDocker, using the folder named
# by the DISK_FOLDER environment variable (KeyError if it is not set).
vespa_docker = VespaDocker(disk_folder=os.environ["DISK_FOLDER"])
app = vespa_docker.deploy(application_package=app_package)
Create a custom Dataset that loads an image, transforms it into a 512-dimensional vector and returns the data in a pyvespa-compatible format.
import os
import glob
import ntpath
import torch
from torch.utils.data import Dataset
from PIL import Image
import clip
class ImageFeedDataset(Dataset):
    """Dataset that turns JPEG files into pyvespa-compatible feed documents.

    Every item is a dict with an ``id`` (the file name without its extension)
    and a ``fields`` mapping that holds the file name and the L2-normalised
    image embedding produced by the CLIP model.

    Args:
        img_dir: Folder searched (non-recursively) for ``*.jpg`` files.
        image_embedding_name: Name of the feed field that receives the
            embedding values.
        model_name: Model identifier handed to ``clip.load``.
    """

    def __init__(self, img_dir, image_embedding_name, model_name):
        self.model, self.preprocess = clip.load(model_name)
        # clip.load chooses the device itself (it can pick a GPU when one is
        # available), so remember where the weights live and send inputs there.
        self.device = next(self.model.parameters()).device
        self.img_dir = img_dir
        # glob returns files in arbitrary order; sort so that index -> file is
        # reproducible between runs.
        self.image_file_names = sorted(glob.glob(os.path.join(img_dir, "*.jpg")))
        self.image_embedding_name = image_embedding_name

    def _from_image_to_vector(self, x):
        """Return the unit-length embedding of image ``x`` as a (1, dim) tensor."""
        with torch.no_grad():
            # Add a batch dimension and move the input to the model's device to
            # avoid a CPU/GPU tensor mismatch inside encode_image.
            batch = self.preprocess(x).unsqueeze(0).to(self.device)
            image_features = self.model.encode_image(batch).float()
            image_features /= image_features.norm(dim=-1, keepdim=True)
        return image_features

    def __len__(self):
        """Return the number of image files found in ``img_dir``."""
        return len(self.image_file_names)

    def __getitem__(self, idx):
        """Return the feed document (``id`` + ``fields``) for image ``idx``."""
        image_file_name = self.image_file_names[idx]
        # The context manager closes the underlying file once the embedding has
        # been computed instead of leaking one handle per item.
        with Image.open(image_file_name) as image:
            embedding = self._from_image_to_vector(image)
        image_base_name = ntpath.basename(image_file_name)
        return {
            # splitext strips only the trailing extension, so names that contain
            # ".jpg" elsewhere (e.g. "a.jpg_1.jpg") keep a distinct id.
            "id": os.path.splitext(image_base_name)[0],
            "fields": {
                "image_file_name": image_base_name,
                self.image_embedding_name: {"values": embedding.tolist()[0]},
            },
        }
# Dataset over the image files found in the folder named by the IMG_DIR
# environment variable (KeyError if it is not set).
image_dataset = ImageFeedDataset(
    # Folder containing image files
    img_dir=os.environ["IMG_DIR"],
    # Name of the Vespa field that will hold the image embedding
    image_embedding_name="vit_b_32_image",
    # CLIP model name used to convert an image into a vector
    model_name="ViT-B/32",
)
The `dataloader` makes it possible to loop through the dataset `batch_size` data points at a time. Since the objective is to feed the data to the Vespa app, we can set `shuffle` to `False`. We also specify a custom `collate_fn` function so that the pyvespa-compatible format is preserved when batching.
from torch.utils.data import DataLoader
# Iterate the dataset in its own order, 128 items per batch. The identity
# collate_fn returns each batch exactly as it receives it, so the feed
# documents are not merged into tensors.
dataloader = DataLoader(
    dataset=image_dataset,
    batch_size=128,
    shuffle=False,
    collate_fn=lambda batch: batch,
)
Note that most of the time is spent creating the image embeddings, so pre-computing them will provide a significant speed-up.
# Feed the dataset to the deployed app one batch at a time.
# Counting starts at 1 so the progress message runs from "1/N" to "N/N"
# instead of stopping at "N-1/N".
for idx, batch in enumerate(dataloader, start=1):
    print("Iteration: {}/{}".format(idx, len(dataloader)))
    app.feed_batch(batch=batch)