Connect to a running Vespa application

It is assumed that you have deployed the MS MARCO full-text Vespa sample application on your local machine.

from vespa.application import Vespa

app = Vespa(url="http://localhost", port=8080)
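
Before running queries, it can be useful to confirm that the application is up. A minimal sanity check, assuming the default container port, hits Vespa's ApplicationStatus endpoint directly:

import requests

# The container answers with HTTP 200 once the application is up.
response = requests.get("http://localhost:8080/ApplicationStatus")
assert response.status_code == 200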

Load the BERT model

Load one of the many pre-trained models available from the sentence-transformers library.

from sentence_transformers import SentenceTransformer

bert_model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")
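
A quick way to check the size of the embeddings this model produces (this particular model outputs 768-dimensional vectors; adjust if you swap models):

# encode returns a numpy array with shape (number of sentences, embedding dimension).
embeddings = bert_model.encode(["this is a test"])
print(embeddings.shape)  # expected: (1, 768) for distilbert-base-nli-stsb-mean-tokens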

Define a function that takes a text as input and returns a normalized vector of floats as output.

import numpy as np

def normalized_bert_encoder(text):
    # Encode the text and keep the result as a numpy array so it can be normalized.
    vector = bert_model.encode([text])[0]
    norm = np.linalg.norm(vector)
    if norm > 0.0:
        vector = vector / norm
    # Return a plain list of floats, which is what the query expects.
    return vector.tolist()
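
A quick sanity check that the encoder indeed returns a unit-norm vector:

# The norm of the returned vector should be approximately 1.
print(np.linalg.norm(normalized_bert_encoder("this is a test")))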

Define a query model

from vespa.query import Query, Union, WeakAnd, ANN, RankProfile

query_model = Query(
    match_phase=Union(
        WeakAnd(hits=1000), 
        ANN(
            doc_vector="title_bert", 
            query_vector="tensor_bert", 
            embedding_model=normalized_bert_encoder, 
            hits=1000, 
            label="ann_title"
        ),
        ANN(
            doc_vector="body_bert", 
            query_vector="tensor_bert", 
            embedding_model=normalized_bert_encoder, 
            hits=1000, 
            label="ann_body"
        )
    ),
    rank_profile=RankProfile(name="bert_title_body_all")
)

At this point we can query our application:

query_results = app.query(query="this is a test", query_model=query_model)
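
We can then inspect the response. The exact shape of the response object depends on the pyvespa version; the snippet below assumes it exposes the raw Vespa JSON through a .json attribute:

# Total number of documents matched by the query (root/fields/totalCount in the Vespa response JSON).
print(query_results.json["root"]["fields"]["totalCount"])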

Load labelled data

I will load a sample of the test set that I used in my experiments.

import requests
import json

labelled_data = json.loads(
    requests.get("https://thigm85.github.io/data/msmarco/labelled_data_msmarco_test_set.json").text
)

Here are the first two queries with their respective relevant documents.

labelled_data[0:2]
[{'query_id': '848370',
  'query': 'what is the state bird of rhode island',
  'relevant_docs': [{'id': 'D2533084'}]},
 {'query_id': '230835',
  'query': 'how far is dallas airport from downtown',
  'relevant_docs': [{'id': 'D1327232'}]}]
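
Each entry holds the query id, the query string and the list of relevant document ids. The sample used here contains 500 labelled queries:

# Number of labelled queries in the sample.
print(len(labelled_data))  # 500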

Evaluate the query model

from vespa.evaluation import MatchRatio, Recall, ReciprocalRank

eval_metrics = [MatchRatio(), Recall(at=100), ReciprocalRank(at=100)]
evaluation = app.evaluate(
    labelled_data=labelled_data, 
    eval_metrics=eval_metrics, 
    query_model=query_model, 
    id_field="id",
    timeout=5
)
evaluation.head()
query_id match_ratio_retrieved_docs match_ratio_docs_available match_ratio_value recall_100_value reciprocal_rank_100_value
0 848370 12738 99577 0.127921 0 0.000000
1 230835 12533 99577 0.125862 1 1.000000
2 915794 11523 99577 0.115719 0 0.000000
3 733084 20047 99577 0.201322 0 0.000000
4 23313 17423 99577 0.174970 1 0.111111
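
The per-query rows are hard to read on their own; one quick way to aggregate them is to average the metric columns over all queries (plain pandas, using the column names shown above):

# Mean of each evaluation metric across the labelled queries.
print(evaluation[["match_ratio_value", "recall_100_value", "reciprocal_rank_100_value"]].mean())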

Baseline query model

baseline_query_model = Query(
    match_phase=WeakAnd(hits=1000),
    rank_profile=RankProfile(name="bm25")
)
baseline_evaluation = app.evaluate(
    labelled_data=labelled_data,
    eval_metrics=eval_metrics,
    query_model=baseline_query_model,
    id_field="id",
    timeout=5
)
baseline_evaluation.head()
query_id match_ratio_retrieved_docs match_ratio_docs_available match_ratio_value recall_100_value reciprocal_rank_100_value
0 848370 7160 99577 0.071904 1 1.000000
1 230835 7581 99577 0.076132 1 1.000000
2 915794 6067 99577 0.060928 1 0.142857
3 733084 15283 99577 0.153479 1 1.000000
4 23313 13424 99577 0.134810 1 0.333333

Compare query model with baseline

from pandas import merge

eval_comparison = merge(left=baseline_evaluation, right=evaluation, on="query_id", suffixes=('_baseline', ''))

Match Ratio

eval_comparison[["match_ratio_value_baseline", "match_ratio_value"]].describe()
match_ratio_value_baseline match_ratio_value
count 500.000000 500.000000
mean 0.124775 0.179036
std 0.116852 0.116558
min 0.000000 0.000000
25% 0.067169 0.122315
50% 0.083393 0.138973
75% 0.121597 0.180619
max 0.873666 0.887836

Recall

eval_comparison[["recall_100_value_baseline", "recall_100_value"]].describe()
recall_100_value_baseline recall_100_value
count 500.000000 500.000000
mean 0.842000 0.586000
std 0.365106 0.493042
min 0.000000 0.000000
25% 1.000000 0.000000
50% 1.000000 1.000000
75% 1.000000 1.000000
max 1.000000 1.000000

Reciprocal rank

eval_comparison[["reciprocal_rank_100_value_baseline", "reciprocal_rank_100_value"]].describe()
reciprocal_rank_100_value_baseline reciprocal_rank_100_value
count 500.000000 500.000000
mean 0.675333 0.449417
std 0.413238 0.453405
min 0.000000 0.000000
25% 0.237500 0.000000
50% 1.000000 0.333333
75% 1.000000 1.000000
max 1.000000 1.000000
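
Another way to compare the two models is to compute the fraction of queries for which the bert query model achieves a strictly higher reciprocal rank than the baseline; a small sketch on top of the merged data frame:

# Share of queries where the bert model outranks the bm25 baseline on reciprocal rank.
better = eval_comparison["reciprocal_rank_100_value"] > eval_comparison["reciprocal_rank_100_value_baseline"]
print(better.mean())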

Sometimes it is helpful to visualize the difference between the two query models.

from pandas import concat
from plotnine import ggplot, geom_boxplot, aes

baseline_evaluation["query_model_name"] = "bm25"
evaluation["query_model_name"] = "bert"
data_plot = concat([evaluation, baseline_evaluation])

ggplot(data_plot) + geom_boxplot(aes(x='query_model_name', y='reciprocal_rank_100_value'))
[Box plot comparing reciprocal_rank_100_value for the bm25 and bert query models.]

Collect training data

When collecting training data, it is important to use a rank profile that applies a random ordering to the matched documents, so that the sampled documents are not biased toward any particular ranking function.

data_collection_query_model = Query(
    match_phase=Union(
        WeakAnd(hits=1000), 
        ANN(
            doc_vector="title_bert", 
            query_vector="tensor_bert", 
            embedding_model=normalized_bert_encoder, 
            hits=1000, 
            label="ann_title"
        ),
        ANN(
            doc_vector="body_bert", 
            query_vector="tensor_bert", 
            embedding_model=normalized_bert_encoder, 
            hits=1000, 
            label="ann_body"
        )
    ),
    rank_profile=RankProfile(
        name="collect_rank_features_embeddings", 
        list_features=True)
)

Once we have defined the data_collection_query_model, we can collect data containing both relevant and random documents.

training_data = app.collect_training_data(
    labelled_data=labelled_data, 
    id_field="id", 
    query_model=data_collection_query_model, 
    number_random_docs=99
)
training_data.head()
bm25(body) bm25(title) nativeRank(body) nativeRank(title) rankingExpression(dot_product_body_bert) rankingExpression(dot_product_body_gse) rankingExpression(dot_product_body_word2vec) rankingExpression(dot_product_title_bert) rankingExpression(dot_product_title_gse) rankingExpression(dot_product_title_word2vec) document_id query_id relevant
0 26.474014 9.612229 0.314493 8.557759e-02 0.247331 0.0 0.0 0.562989 0.0 0.0 D2533084 848370 1
1 3.247162 0.000000 0.117618 1.351221e-38 0.080011 0.0 0.0 0.205397 0.0 0.0 D864574 848370 0
2 3.259604 0.000000 0.156522 1.351221e-38 0.011655 0.0 0.0 0.171949 0.0 0.0 D3246601 848370 0
3 1.853070 0.835559 0.121469 6.774715e-02 0.274763 0.0 0.0 0.292467 0.0 0.0 D3028705 848370 0
4 7.327304 0.000000 0.153262 1.351221e-38 0.045287 0.0 0.0 0.048524 0.0 0.0 D289677 848370 0
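
Since number_random_docs=99, each labelled query contributes its relevant document(s) plus up to 99 randomly selected matched documents, so the training data is heavily imbalanced. A quick check:

# Count relevant (1) versus random (0) rows in the collected training data.
print(training_data["relevant"].value_counts())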

We can now create a figure similar to the one displayed in the semantic text search tutorial, showing the MS MARCO bias toward term-matching signals such as BM25.

import plotly.graph_objects as go

relevant_training_data = training_data[training_data["relevant"] == 1]

fig = go.Figure()
fig.add_trace(
    go.Histogram(
        x=training_data["bm25(body)"] + training_data["bm25(title)"], 
        histnorm='probability density',
        name = "relevant + random"
    )
)
fig.add_trace(
    go.Histogram(
        x=relevant_training_data["bm25(body)"] + relevant_training_data["bm25(title)"], 
        histnorm='probability density',
        name = "relevant"
    )
)
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.show()
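
If you prefer a numeric summary to the histogram, grouping by the relevant label tells the same story (a plain pandas sketch using the feature columns shown above):

# Mean BM25 scores for random (relevant = 0) versus relevant (relevant = 1) documents.
print(training_data.groupby("relevant")[["bm25(body)", "bm25(title)"]].mean())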