Semantic text search with Vespa python API
Explore the MS MARCO full-text search application
- Connect to a running Vespa application
- Load the BERT model
- Define a query model
- Load labelled data
- Evaluate the query model
- Baseline query model
- Compare query model with baseline
- Collect training data
It is assumed that you have deployed the MS MARCO full-text Vespa sample application on your local machine.
from vespa.application import Vespa
app = Vespa(url="http://localhost", port=8080)
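Before moving on, we can verify that the application is reachable by hitting Vespa's standard ApplicationStatus endpoint (a quick sanity check added here; it is not part of the original flow):
import requests
# Vespa serves an ApplicationStatus page on the container port;
# a 200 status code means the application is up and reachable.
response = requests.get("http://localhost:8080/ApplicationStatus")
print(response.status_code)  # expect 200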
Load one of the many pre-trained models available from the sentence-transformers library.
from sentence_transformers import SentenceTransformer
bert_model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")
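As a quick check, using the standard sentence-transformers API, we can confirm the dimensionality of the embeddings this model produces:
# distilbert-base-nli-stsb-mean-tokens outputs 768-dimensional embeddings.
print(bert_model.get_sentence_embedding_dimension())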
Define a function that takes a text as input and returns a normalized vector of floats as output.
import numpy as np
def normalized_bert_encoder(text):
    # Encode the text and normalize the resulting vector to unit length.
    vector = bert_model.encode([text])[0]
    norm = np.linalg.norm(vector)
    if norm > 0.0:
        vector = vector / norm
    return vector.tolist()
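A quick sanity check of the function above: the returned vector should have the model's embedding dimension and, after normalization, unit norm.
test_vector = normalized_bert_encoder("this is a test")
print(len(test_vector))             # embedding dimension, 768 for this model
print(np.linalg.norm(test_vector))  # ~1.0 after normalization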
from vespa.query import Query, Union, WeakAnd, ANN, RankProfile

query_model = Query(
    match_phase=Union(
        WeakAnd(hits=1000),
        ANN(
            doc_vector="title_bert",
            query_vector="tensor_bert",
            embedding_model=normalized_bert_encoder,
            hits=1000,
            label="ann_title"
        ),
        ANN(
            doc_vector="body_bert",
            query_vector="tensor_bert",
            embedding_model=normalized_bert_encoder,
            hits=1000,
            label="ann_body"
        )
    ),
    rank_profile=RankProfile(name="bert_title_body_all")
)
This query model matches documents through the union of a WeakAnd operator and two ANN operators, one over the title embeddings and one over the body embeddings, and ranks the matches with the bert_title_body_all rank profile. At this point we can query our application:
query_results = app.query(query="this is a test", query_model=query_model)
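The returned object wraps Vespa's JSON response. Assuming the pyvespa version used in this post, we can inspect how many documents were matched and look at the top hit:
# Attribute names below assume the pyvespa version used in this post.
print(query_results.number_documents_retrieved)
print(query_results.hits[0]["id"])  # id of the highest-ranked document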
I will load a sample of the test set that I used in my experiments.
import requests
import json
labelled_data = json.loads(
    requests.get("https://thigm85.github.io/data/msmarco/labelled_data_msmarco_test_set.json").text
)
Here are the first two queries with their respective relevant documents.
labelled_data[0:2]
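Each element follows pyvespa's labelled-data format: a query id, the query string, and a list of relevant documents. An illustrative entry, with made-up values, looks like this:
# Illustrative shape of one entry; the id and query text are made up.
example_entry = {
    "query_id": 1,
    "query": "what is the definition of a test query",
    "relevant_docs": [{"id": "D1234567"}]
}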
from vespa.evaluation import MatchRatio, Recall, ReciprocalRank
eval_metrics = [MatchRatio(), Recall(at=100), ReciprocalRank(at=100)]
evaluation = app.evaluate(
    labelled_data=labelled_data,
    eval_metrics=eval_metrics,
    query_model=query_model,
    id_field="id",
    timeout=5
)
evaluation.head()
baseline_query_model = Query(
    match_phase=WeakAnd(hits=1000),
    rank_profile=RankProfile(name="bm25")
)
baseline_evaluation = app.evaluate(
    labelled_data=labelled_data,
    eval_metrics=eval_metrics,
    query_model=baseline_query_model,
    id_field="id",
    timeout=5
)
baseline_evaluation.head()
from pandas import merge
eval_comparison = merge(
    left=baseline_evaluation,
    right=evaluation,
    on="query_id",
    suffixes=('_baseline', '')
)
Match Ratio
eval_comparison[["match_ratio_value_baseline", "match_ratio_value"]].describe()
Recall
eval_comparison[["recall_100_value_baseline", "recall_100_value"]].describe()
Reciprocal Rank
eval_comparison[["reciprocal_rank_100_value_baseline", "reciprocal_rank_100_value"]].describe()
Sometimes it is helpful to visually see the difference between the two query models.
from pandas import concat
from plotnine import ggplot, geom_boxplot, aes
baseline_evaluation["query_model_name"] = "bm25"
evaluation["query_model_name"] = "bert"
data_plot = concat([evaluation, baseline_evaluation])
ggplot(data_plot) + geom_boxplot(aes(x='query_model_name', y='reciprocal_rank_100_value'))
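The boxplot can be complemented with a numeric summary by grouping the combined data by query model:
# Mean reciprocal rank per query model, computed from the plotted data.
data_plot.groupby("query_model_name")["reciprocal_rank_100_value"].mean()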
When collecting training data, it is important to use a rank profile that applies a random ordering to the matched documents, so that the randomly sampled (non-relevant) examples are not biased by an existing ranking function.
data_collection_query_model = Query(
    match_phase=Union(
        WeakAnd(hits=1000),
        ANN(
            doc_vector="title_bert",
            query_vector="tensor_bert",
            embedding_model=normalized_bert_encoder,
            hits=1000,
            label="ann_title"
        ),
        ANN(
            doc_vector="body_bert",
            query_vector="tensor_bert",
            embedding_model=normalized_bert_encoder,
            hits=1000,
            label="ann_body"
        )
    ),
    rank_profile=RankProfile(
        name="collect_rank_features_embeddings",
        list_features=True
    )
)
Once we have defined the data_collection_query_model, we can collect data containing both relevant and random documents.
training_data = app.collect_training_data(
    labelled_data=labelled_data,
    id_field="id",
    query_model=data_collection_query_model,
    number_random_docs=99
)
training_data.head()
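As a sanity check, the relevant column, which is also used in the plot below, flags relevant documents versus the randomly sampled ones:
# Count relevant (1) versus random (0) documents in the collected data.
training_data["relevant"].value_counts()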
We can now create a figure similar to the one displayed in the semantic text search tutorial, showing the MS MARCO bias toward term-matching signals like BM25.
import plotly.graph_objects as go

relevant_training_data = training_data[training_data["relevant"] == 1]

fig = go.Figure()
fig.add_trace(
    go.Histogram(
        x=training_data["bm25(body)"] + training_data["bm25(title)"],
        histnorm='probability density',
        name="relevant + random"
    )
)
fig.add_trace(
    go.Histogram(
        x=relevant_training_data["bm25(body)"] + relevant_training_data["bm25(title)"],
        histnorm='probability density',
        name="relevant"
    )
)
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.show()