Load data collected from Vespa

The dataset used here was created by collecting ranking features from Vespa associated with the labelled data released in round 3 of the TREC-CORD competition.

vespa_cord19.head(2)
topic_id iteration cord_uid relevancy query query-rewrite query-vector question narrative fieldMatch(abstract) ... fieldLength(abstract) fieldLength(body_text) fieldLength(title) freshness(timestamp) nativeRank(abstract) nativeRank(abstract_t5) nativeRank(title) rawScore(specter_embedding) rawScore(abstract_embedding) rawScore(title_embedding)
0 1 0.5 010vptx3 2 coronavirus origin coronavirus origin origin COVID-19 information... (0.28812721371650696, 1.558979868888855, 0.481... what is the origin of COVID-19 seeking range of information about the SARS-Co... 0.111406 ... 0 0 0 0 0 0 0 0 0 0
1 1 2.0 p0kv1pht 1 coronavirus origin coronavirus origin origin COVID-19 information... (0.28812721371650696, 1.558979868888855, 0.481... what is the origin of COVID-19 seeking range of information about the SARS-Co... 0.094629 ... 0 0 0 0 0 0 0 0 0 0

2 rows × 172 columns

Ranking features available

There are 163 ranking features available.

# Columns holding metadata or labels rather than Vespa ranking features.
# ('binary_relevance' is created later in this notebook but is excluded
# up front so this cell can be re-run after it exists.)
non_feature_columns = {
    'topic_id', 'iteration', 'cord_uid', 'relevancy', 'binary_relevance',
    'query', 'query-rewrite', 'query-vector', 'question', 'narrative',
}
# Every remaining column is a ranking feature computed by Vespa.
features = [column for column in vespa_cord19.columns if column not in non_feature_columns]
print(len(features))
163

Simplify target label

The original labelled data has three types of label: 0, 1 and 2. To simplify, we will consider just two labels here: the document is either relevant (label = 1) or irrelevant (label = 0).

# Collapse the three-level relevance judgements (0, 1, 2) into a binary label:
# any positive relevancy counts as relevant. The vectorised comparison replaces
# the original row-wise apply — same values (int 0/1), far faster.
vespa_cord19["binary_relevance"] = (vespa_cord19["relevancy"] > 0).astype(int)
vespa_cord19[['relevancy', 'binary_relevance']].head()
relevancy binary_relevance
0 2 1
1 1 1
2 2 1
3 0 0
4 0 0

Define the model used

from sklearn.linear_model import LogisticRegression

# Plain (unpenalised) logistic regression: with no regularisation the fitted
# coefficients are comparable across different feature subsets.
# NOTE(review): the string alias penalty='none' was deprecated in scikit-learn
# 1.2 and removed in 1.4; penalty=None is the supported spelling (>= 1.2).
estimator = LogisticRegression(penalty=None, fit_intercept=True)

Create custom score function

The function needs to have the following format: score_func(y, y_pred, **kwargs).

from math import log
from sklearn.metrics import make_scorer

def compute_log_prob(y, y_pred):
    """Log-likelihood of binary targets ``y`` under probabilities ``y_pred``.

    This is the (un-normalised) negative binary cross-entropy; values are
    <= 0 and larger (closer to zero) is better, so it can be maximised
    during model selection.
    """
    total = 0
    for prob, target in zip(y_pred, y):
        total += target * log(prob) + (1 - target) * log(1 - prob)
    return total

# Wrap the log-likelihood as a scikit-learn scorer; needs_proba=True makes the
# selector pass predicted probabilities (not hard class labels) to compute_log_prob.
# NOTE(review): needs_proba is deprecated in scikit-learn >= 1.4 in favour of
# response_method="predict_proba" — confirm the installed version.
scorer = make_scorer(score_func=compute_log_prob, needs_proba=True)

Data used

# Binary target labels and the full ranking-feature matrix for model selection.
y = vespa_cord19["binary_relevance"]
X = vespa_cord19[features]

Feature Selector

The code below was used to generate the sequential feature selection results that will be analysed in this report. It takes some hours to run on a machine with 32 CPUs.

import os
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

columns_to_write = [
    "avg_score", "ci_bound", "cv_scores", "feature_idx",
    "feature_names", "std_dev", "std_err", "forward", "floating"
]
output_file_name = "sequential_feature.csv"
# BUG FIX: `results` was appended to below but never initialised (NameError on
# the first iteration); also removed an unused `columns` assignment.
results = []
# Run all four sequential-selection variants: forward/backward x plain/floating.
for forward in [True, False]:
    for floating in [False, True]:
        sfs = SFS(
            estimator=estimator,
            k_features=(1, len(features)),  # search every subset size
            forward=forward,
            floating=floating,
            scoring=scorer,
            cv=4,
            n_jobs=-1,  # parallelise the CV fits across all cores
            verbose=1
        )
        sfs = sfs.fit(X, y)
        sfs_df = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
        sfs_df["forward"] = forward
        sfs_df["floating"] = floating
        # Append to the CSV, writing the header only if the file is new.
        write_header = not os.path.isfile(output_file_name)
        sfs_df.to_csv(
            output_file_name,
            mode="w" if write_header else "a",
            header=write_header,
            index=False,
            columns=columns_to_write,
        )
        results.append(sfs)

Loading the pre-computed results

from pandas import read_csv

# Load the pre-computed selection results written by the loop above.
# Note: cv_scores, feature_idx and feature_names come back as string reprs
# and are parsed by the utility functions below.
data = read_csv("sequential_feature.csv")
data.head(2)
avg_score ci_bound cv_scores feature_idx feature_names std_dev std_err forward floating
0 -2781.720568 82.261812 [-2755.82180826 -2795.84549409 -2856.86996371 ... (23,) ('fieldMatch(abstract).significance',) 51.318010 29.628467 True False
1 -2728.851119 68.207668 [-2778.41736416 -2698.51375201 -2762.13271275 ... (10, 23) ('fieldMatch(abstract).importance', 'fieldMatc... 42.550507 24.566547 True False

Utility functions

from pandas import DataFrame
def create_dfs(df):
    """Explode the stringified CV results into per-fold rows.

    The CSV round-trip stores ``cv_scores`` as an array repr (e.g.
    "[-2755.8 -2795.8 ...]") and ``feature_idx`` as a tuple repr (e.g.
    "(10, 23)"); both are parsed back here.

    Returns a tuple ``(sequential_selection, features_by_size)``: the first
    frame has one row per individual CV fold score, the second maps each
    subset size to the feature names selected at that size.
    """
    from ast import literal_eval  # stdlib; safely parses the stored tuple reprs

    cv_scores, number_features, feature_names = [], [], []
    for idx, row in df.iterrows():
        scores = [float(x) for x in row["cv_scores"].lstrip("[").rstrip("]").split()]
        cv_scores.extend(scores)
        # BUG FIX: the old split-on-comma count reported 2 features for
        # single-element tuples like "(23,)" (the trailing comma produced an
        # empty token); literal_eval counts the tuple correctly.
        n_selected = len(literal_eval(row["feature_idx"]))
        number_features.extend([n_selected] * len(scores))
        feature_names.extend([row["feature_names"]] * len(scores))
    sequential_selection = DataFrame(
        data={
            "cv_scores": cv_scores,
            "number_features": number_features,
            "feature_names": feature_names,
        }
    )
    features_by_size = sequential_selection[
        ["number_features", "feature_names"]
    ].sort_values("number_features").drop_duplicates(["number_features"])
    return sequential_selection, features_by_size
def display_feature_names(df, number_features):
    """Return the list of feature names selected at the given subset size.

    Parses the tuple repr stored in the ``feature_names`` column.
    BUG FIX: the previous split-on-comma parsing returned a spurious empty
    string for single-feature tuples like "('bm25(title)',)".
    """
    from ast import literal_eval  # stdlib; robustly parses the tuple repr

    names_repr = df[df.number_features == number_features].iloc[0]["feature_names"]
    return list(literal_eval(names_repr))

Forward sequential selection

import plotly.express as px

# Plain forward selection: greedily add features, no floating step.
selection_mask = (data["forward"] == True) & (data["floating"] == False)
forward_df = data[selection_mask]
forward_sequential_selection, forward_features = create_dfs(forward_df)

# One box per subset size, summarising the CV fold scores.
px.box(forward_sequential_selection, x="number_features", y="cv_scores").show()
display_feature_names(forward_features, 7)
['fieldMatch(abstract)',
 'fieldMatch(abstract).importance',
 'fieldMatch(abstract).significance',
 'fieldMatch(body_text).absoluteProximity',
 'fieldMatch(body_text).fieldCompleteness',
 'textSimilarity(body_text).queryCoverage',
 'bm25(title)']

Forward selection with floating

import plotly.express as px

# Forward selection with floating: after each addition, try removing features.
selection_mask = (data["forward"] == True) & (data["floating"] == True)
forward_floating_df = data[selection_mask]
forward_floating_sequential_selection, forward_floating_features = create_dfs(forward_floating_df)

# One box per subset size, summarising the CV fold scores.
px.box(forward_floating_sequential_selection, x="number_features", y="cv_scores").show()
display_feature_names(forward_floating_features, 7)
['fieldMatch(abstract)',
 'fieldMatch(abstract).importance',
 'fieldMatch(abstract).significance',
 'fieldMatch(body_text).absoluteProximity',
 'fieldMatch(body_text).fieldCompleteness',
 'textSimilarity(body_text).queryCoverage',
 'bm25(title)']

Backward sequential selection

import plotly.express as px

# Plain backward selection: start with all features, greedily remove.
selection_mask = (data["forward"] == False) & (data["floating"] == False)
backward_df = data[selection_mask]
backward_sequential_selection, backward_features = create_dfs(backward_df)

# One box per subset size, summarising the CV fold scores.
px.box(backward_sequential_selection, x="number_features", y="cv_scores").show()
display_feature_names(backward_features, 7)
['fieldMatch(abstract).longestSequence',
 'fieldMatch(body_text).proximity',
 'fieldMatch(body_text).weight',
 'fieldMatch(title).matches',
 'textSimilarity(title).queryCoverage',
 'bm25(abstract)',
 'bm25(title)']

Backward selection with floating

import plotly.express as px

# Backward selection with floating: after each removal, try re-adding features.
selection_mask = (data["forward"] == False) & (data["floating"] == True)
backward_floating_df = data[selection_mask]
backward_floating_sequential_selection, backward_floating_features = create_dfs(backward_floating_df)

# One box per subset size, summarising the CV fold scores.
px.box(backward_floating_sequential_selection, x="number_features", y="cv_scores").show()
display_feature_names(backward_floating_features, 7)
['fieldMatch(abstract).absoluteOccurrence',
 'fieldMatch(body_text).proximity',
 'fieldMatch(body_text).significance',
 'fieldMatch(body_text).weight',
 'textSimilarity(body_text).fieldCoverage',
 'textSimilarity(title).queryCoverage',
 'bm25(abstract)']

Summary

# Side-by-side comparison of the 7-feature subset chosen by each strategy.
strategies = {
    "forward": forward_features,
    "forward_floating": forward_floating_features,
    "backward": backward_features,
    "backward_floating": backward_floating_features,
}
DataFrame(data={name: display_feature_names(frame, 7) for name, frame in strategies.items()})
forward forward_floating backward backward_floating
0 fieldMatch(abstract) fieldMatch(abstract) fieldMatch(abstract).longestSequence fieldMatch(abstract).absoluteOccurrence
1 fieldMatch(abstract).importance fieldMatch(abstract).importance fieldMatch(body_text).proximity fieldMatch(body_text).proximity
2 fieldMatch(abstract).significance fieldMatch(abstract).significance fieldMatch(body_text).weight fieldMatch(body_text).significance
3 fieldMatch(body_text).absoluteProximity fieldMatch(body_text).absoluteProximity fieldMatch(title).matches fieldMatch(body_text).weight
4 fieldMatch(body_text).fieldCompleteness fieldMatch(body_text).fieldCompleteness textSimilarity(title).queryCoverage textSimilarity(body_text).fieldCoverage
5 textSimilarity(body_text).queryCoverage textSimilarity(body_text).queryCoverage bm25(abstract) textSimilarity(title).queryCoverage
6 bm25(title) bm25(title) bm25(title) bm25(abstract)