Semantic text search with Vespa python API
Explore the MS MARCO full-text search application
- Connect to a running Vespa application
- Load the BERT model
- Define a query model
- Load labelled data
- Evaluate the query model
- Baseline query model
- Compare query model with baseline
- Collect training data
It is assumed that you have deployed the MS MARCO full-text Vespa sample application on your local machine.
from vespa.application import Vespa
app = Vespa(url="http://localhost", port=8080)
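Before moving on, we can verify that the application is reachable by hitting Vespa's standard ApplicationStatus endpoint (a quick sanity check added here; it is not part of the original flow):
import requests
# Vespa serves an ApplicationStatus page on the container port;
# a 200 status code means the application is up and reachable.
response = requests.get("http://localhost:8080/ApplicationStatus")
print(response.status_code)  # expect 200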
Load one of the many pre-trained models available from the sentence-transformers library.
from sentence_transformers import SentenceTransformer
bert_model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")
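As a quick check, using the standard sentence-transformers API, we can confirm the dimensionality of the embeddings this model produces:
# distilbert-base-nli-stsb-mean-tokens outputs 768-dimensional embeddings.
print(bert_model.get_sentence_embedding_dimension())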
Define a function that takes a text as input and returns a normalized vector of floats as output.
import numpy as np
def normalized_bert_encoder(text):
    # Encode the text and normalize the resulting vector to unit length.
    vector = bert_model.encode([text])[0]
    norm = np.linalg.norm(vector)
    if norm > 0.0:
        vector = vector / norm
    return vector.tolist()
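A quick sanity check of the function above: the returned vector should have the model's embedding dimension and, after normalization, unit norm.
test_vector = normalized_bert_encoder("this is a test")
print(len(test_vector))             # embedding dimension, 768 for this model
print(np.linalg.norm(test_vector))  # ~1.0 after normalization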
from vespa.query import Query, Union, WeakAnd, ANN, RankProfile

query_model = Query(
    match_phase=Union(
        WeakAnd(hits=1000),
        ANN(
            doc_vector="title_bert",
            query_vector="tensor_bert",
            embedding_model=normalized_bert_encoder,
            hits=1000,
            label="ann_title"
        ),
        ANN(
            doc_vector="body_bert",
            query_vector="tensor_bert",
            embedding_model=normalized_bert_encoder,
            hits=1000,
            label="ann_body"
        )
    ),
    rank_profile=RankProfile(name="bert_title_body_all")
)
This query model matches documents through the union of a WeakAnd operator and two ANN operators, one over the title embeddings and one over the body embeddings, and ranks the matches with the bert_title_body_all rank profile. At this point we can query our application:
query_results = app.query(query="this is a test", query_model=query_model)
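The returned object wraps Vespa's JSON response. Assuming the pyvespa version used in this post, we can inspect how many documents were matched and look at the top hit:
# Attribute names below assume the pyvespa version used in this post.
print(query_results.number_documents_retrieved)
print(query_results.hits[0]["id"])  # id of the highest-ranked document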
I will load a sample of the test set that I used in my experiments.
import requests
import json
labelled_data = json.loads(
    requests.get("https://thigm85.github.io/data/msmarco/labelled_data_msmarco_test_set.json").text
)
Here are the first two queries with their respective relevant documents.
labelled_data[0:2]
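Each element follows pyvespa's labelled-data format: a query id, the query string, and a list of relevant documents. An illustrative entry, with made-up values, looks like this:
# Illustrative shape of one entry; the id and query text are made up.
example_entry = {
    "query_id": 1,
    "query": "what is the definition of a test query",
    "relevant_docs": [{"id": "D1234567"}]
}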
from vespa.evaluation import MatchRatio, Recall, ReciprocalRank
eval_metrics = [MatchRatio(), Recall(at=100), ReciprocalRank(at=100)]
evaluation = app.evaluate(
    labelled_data=labelled_data,
    eval_metrics=eval_metrics,
    query_model=query_model,
    id_field="id",
    timeout=5
)
evaluation.head()
baseline_query_model = Query(
    match_phase=WeakAnd(hits=1000),
    rank_profile=RankProfile(name="bm25")
)
baseline_evaluation = app.evaluate(
    labelled_data=labelled_data,
    eval_metrics=eval_metrics,
    query_model=baseline_query_model,
    id_field="id",
    timeout=5
)
baseline_evaluation.head()
from pandas import merge
eval_comparison = merge(
    left=baseline_evaluation,
    right=evaluation,
    on="query_id",
    suffixes=('_baseline', '')
)
Match Ratio
eval_comparison[["match_ratio_value_baseline", "match_ratio_value"]].describe()
Recall
eval_comparison[["recall_100_value_baseline", "recall_100_value"]].describe()
Reciprocal Rank
eval_comparison[["reciprocal_rank_100_value_baseline", "reciprocal_rank_100_value"]].describe()
Sometimes it is helpful to visually see the difference between the two query models.
from pandas import concat
from plotnine import ggplot, geom_boxplot, aes
baseline_evaluation["query_model_name"] = "bm25"
evaluation["query_model_name"] = "bert"
data_plot = concat([evaluation, baseline_evaluation])
ggplot(data_plot) + geom_boxplot(aes(x='query_model_name', y='reciprocal_rank_100_value'))
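The boxplot can be complemented with a numeric summary by grouping the combined data by query model:
# Mean reciprocal rank per query model, computed from the plotted data.
data_plot.groupby("query_model_name")["reciprocal_rank_100_value"].mean()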
When collecting training data, it is important to use a rank profile that applies a random ordering to the matched documents, so that the randomly sampled (non-relevant) examples are not biased by an existing ranking function.
data_collection_query_model = Query(
    match_phase=Union(
        WeakAnd(hits=1000),
        ANN(
            doc_vector="title_bert",
            query_vector="tensor_bert",
            embedding_model=normalized_bert_encoder,
            hits=1000,
            label="ann_title"
        ),
        ANN(
            doc_vector="body_bert",
            query_vector="tensor_bert",
            embedding_model=normalized_bert_encoder,
            hits=1000,
            label="ann_body"
        )
    ),
    rank_profile=RankProfile(
        name="collect_rank_features_embeddings",
        list_features=True
    )
)
Once we have defined the data_collection_query_model, we can collect data containing both relevant and random documents.
training_data = app.collect_training_data(
    labelled_data=labelled_data,
    id_field="id",
    query_model=data_collection_query_model,
    number_random_docs=99
)
training_data.head()
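As a sanity check, the relevant column, which is also used in the plot below, flags relevant documents versus the randomly sampled ones:
# Count relevant (1) versus random (0) documents in the collected data.
training_data["relevant"].value_counts()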
We can now create a figure similar to the one displayed in the semantic text search tutorial, showing the MS MARCO bias toward term-matching signals like BM25.
import plotly.graph_objects as go

relevant_training_data = training_data[training_data["relevant"] == 1]

fig = go.Figure()
fig.add_trace(
    go.Histogram(
        x=training_data["bm25(body)"] + training_data["bm25(title)"],
        histnorm='probability density',
        name="relevant + random"
    )
)
fig.add_trace(
    go.Histogram(
        x=relevant_training_data["bm25(body)"] + relevant_training_data["bm25(title)"],
        histnorm='probability density',
        name="relevant"
    )
)
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.show()