Evaluate text-image search app with Flickr 8k dataset
Create labeled data, text processor and evaluate with Vespa python API
This post creates a labeled dataset out of the Flickr 8k image-caption dataset, builds a text processor that uses a CLIP model to map a text query into the same 512-dimensional space used to represent the images, and evaluates different query models using the Vespa python API.
Check the previous three posts in this series for context.
Each (image, caption) pair in the expert annotations was scored from 1 to 4 by three experts, with 4 meaning the caption describes the image. An (image, caption) pair will be considered relevant for our purposes only if all three experts agreed on a relevance score equal to 4.
import os
from pandas import read_csv

# Read the expert judgments: one row per (image, caption) pair with three expert scores.
experts = read_csv(
    os.path.join(os.environ["DATA_FOLDER"], "ExpertAnnotations.txt"),
    sep="\t",
    header=None,
    names=["image_file_name", "caption_id", "expert_1", "expert_2", "expert_3"]
)
experts.head()
# Keep only the pairs where all three experts gave the same score.
experts_agreement_bool = experts.apply(
    lambda x: x["expert_1"] == x["expert_2"] and x["expert_2"] == x["expert_3"],
    axis=1
)
experts_agreement = experts[experts_agreement_bool][
    ["image_file_name", "caption_id", "expert_1"]
].rename(columns={"expert_1": "expert"})
experts_agreement.head()
experts_agreement["expert"].value_counts().sort_index()
# Read all captions; each image has five captions, each identified by a caption_id.
captions = read_csv(
    os.path.join(os.environ["DATA_FOLDER"], "Flickr8k.token.txt"),
    sep="\t",
    header=None,
    names=["caption_id", "caption"]
)
captions.head()
def get_caption(caption_id, captions):
    # Return the caption text associated with a caption_id.
    return captions[captions["caption_id"] == caption_id]["caption"].values[0]
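As a quick illustration (not part of the original flow), we can look up the caption text for the first caption_id in the file; this is how caption_id links the expert annotations to the caption text:

# Illustration only: fetch the caption text for the first caption_id in the captions file.
example_caption_id = captions["caption_id"].iloc[0]
get_caption(caption_id=example_caption_id, captions=captions)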
# Keep only the (image, caption) pairs that all three experts scored 4.
relevant_data = experts_agreement[experts_agreement["expert"] == 4]
relevant_data.head(3)
from ntpath import basename
from pandas import DataFrame

# Build the labeled data: one query per relevant caption, with the image file name as
# the id of the single relevant document and a relevance score of 1.
labeled_data = DataFrame(
    data={
        "qid": list(range(relevant_data.shape[0])),
        "query": [get_caption(
            caption_id=x,
            captions=captions
        ).replace(" ,", "").replace(" .", "") for x in list(relevant_data.caption_id)],
        "doc_id": [basename(x) for x in list(relevant_data.image_file_name)],
        "relevance": 1
    }
)
labeled_data.head()
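As a sanity check, we can confirm that the labeled data has the four columns used by the evaluation below and count how many (query, relevant image) pairs were produced:

# Sanity check: the evaluation below uses the qid, query, doc_id and relevance columns.
assert set(labeled_data.columns) == {"qid", "query", "doc_id", "relevance"}
labeled_data.shape[0]  # number of labeled queries, each with exactly one relevant image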
Create a text processor to map a text string into the same 512-dimensional space used to embed the images.
import clip
import torch

class TextProcessor(object):
    def __init__(self, model_name):
        # Load the pre-trained CLIP model; the image pre-processing function is not needed for text.
        self.model, _ = clip.load(model_name)

    def embed(self, text):
        # Tokenize the text and compute a unit-norm embedding in the shared text-image space.
        text_tokens = clip.tokenize(text)
        with torch.no_grad():
            text_features = self.model.encode_text(text_tokens).float()
            text_features /= text_features.norm(dim=-1, keepdim=True)
        return text_features.tolist()[0]
Define search evaluation metrics:
from vespa.evaluation import MatchRatio, Recall, ReciprocalRank

eval_metrics = [
    MatchRatio(),
    Recall(at=5),
    Recall(at=100),
    ReciprocalRank(at=5),
    ReciprocalRank(at=100)
]
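To make the metrics concrete, here is a small hand-computed example (illustrative only, not using the library): with a single relevant image and a ranked list of hits, Recall@5 is 1 if the relevant image appears in the top 5, and ReciprocalRank@5 is 1 divided by its position in the top 5:

# Illustrative only: hand-computed Recall@5 and ReciprocalRank@5 for a toy ranked list.
ranked_ids = ["im_1.jpg", "im_2.jpg", "im_3.jpg", "im_4.jpg", "im_5.jpg"]  # hypothetical hits, best first
relevant_ids = {"im_2.jpg"}                                                # the single relevant image

recall_at_5 = len(relevant_ids.intersection(ranked_ids[:5])) / len(relevant_ids)  # 1.0
reciprocal_rank_at_5 = next(
    (1 / (rank + 1) for rank, doc_id in enumerate(ranked_ids[:5]) if doc_id in relevant_ids),
    0.0
)  # 0.5, since the relevant image sits at position 2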
Instantiate the TextProcessor with a specific CLIP model.
text_processor = TextProcessor(model_name="ViT-B/32")
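As a quick check, we can embed a sample text (a hypothetical caption, not from the dataset) and confirm that the output is a 512-dimensional unit-norm vector, the same space used for the image embeddings:

import numpy as np

# The text embedding should have the same dimension (512 for ViT-B/32) and unit norm as the image vectors.
sample_embedding = text_processor.embed("a child playing in a park")
len(sample_embedding), float(np.linalg.norm(sample_embedding))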
Create the QueryModels to be evaluated. In this case we create two query models based on the ViT-B/32 CLIP model: one that sends the query as is, and another that prepends the prompt "A photo of " to the query before sending it, as suggested in the original CLIP paper.
from vespa.query import QueryModel

def create_vespa_query(query, prompt=False):
    # Optionally prepend the prompt before embedding, then build the Vespa request body:
    # an approximate nearest-neighbor search over the image embeddings, ranked by the
    # vit-b-32-similarity rank profile.
    if prompt:
        query = "A photo of " + query.lower()
    return {
        'yql': 'select * from sources * where ([{"targetNumHits":100}]nearestNeighbor(vit_b_32_image,vit_b_32_text));',
        'hits': 100,
        'ranking.features.query(vit_b_32_text)': text_processor.embed(query),
        'ranking.profile': 'vit-b-32-similarity',
        'timeout': 10
    }

query_model_1 = QueryModel(name="vit_b_32", body_function=create_vespa_query)
query_model_2 = QueryModel(name="vit_b_32_prompt", body_function=lambda x: create_vespa_query(x, prompt=True))
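To see what the two query models send to Vespa, we can build the request bodies for a hypothetical query directly with create_vespa_query; the only difference between the two is the text that gets embedded:

# Illustration only: inspect the generated request bodies for a hypothetical query.
sample_query = "Two dogs playing in the snow"
body_plain = create_vespa_query(sample_query)                # embeds the query as is
body_prompt = create_vespa_query(sample_query, prompt=True)  # embeds "A photo of two dogs playing in the snow"

body_plain["yql"], body_plain["ranking.profile"], len(body_plain["ranking.features.query(vit_b_32_text)"])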
Create a connection to the Vespa instance:
from vespa.application import Vespa

app = Vespa(
    url=os.environ["VESPA_END_POINT"],
    cert=os.environ["PRIVATE_CERTIFICATE_PATH"]
)
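Before running the full evaluation, a single query can be sent as a smoke test. This is a minimal sketch assuming the pyvespa version used in this post, where a raw request body can be passed to app.query and the response exposes the returned hits:

# Sketch only: send one hand-written query and look at how many hits come back.
response = app.query(body=create_vespa_query("A person riding a bike down a hill"))
len(response.hits)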
Evaluate the query models using the labeled data and the metrics defined earlier. The image_file_name field of the Vespa application is used as the document id, matching the doc_id column of the labeled data.
result = app.evaluate(
    labeled_data=labeled_data,
    eval_metrics=eval_metrics,
    query_model=[query_model_1, query_model_2],
    id_field="image_file_name"
)
The results show that there is plenty of room for improvement over the pre-trained ViT-B/32 CLIP model.
result