Reducing the number of initial features to select
Can we use the predictive performance of individual features to eliminate useless features?
- Load data collected from Vespa
- Ranking features available
- Simplify target label
- Model
- Subset selection routine
- Analyze results
- Conclusion
The dataset used here was created by collecting ranking features from Vespa and joining them with the labelled data released in round 3 of the TREC-COVID competition.
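For context, here is a minimal sketch of how such a dataset could be assembled; the file names are hypothetical, and the notebook below simply loads the prepared CSV. The column names (`topic_id`, `cord_uid`, `relevancy`) are the ones used throughout this post.
import pandas as pd

# Hypothetical inputs: TREC-COVID round 3 judgments and one row of Vespa rank
# features per (topic_id, cord_uid) pair.
qrels = pd.read_csv("data/qrels_round3.csv")                 # topic_id, iteration, cord_uid, relevancy
rank_features = pd.read_csv("data/vespa_rank_features.csv")  # topic_id, cord_uid, feature columns

labelled_features = qrels.merge(rank_features, on=["topic_id", "cord_uid"])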
import pandas as pd

vespa_cord19 = pd.read_csv("data/2020-05-27-subset-selection/training_features.csv")
vespa_cord19.head(2)
There are 163 ranking features available.
features = [
    x for x in list(vespa_cord19.columns) if x not in [
        'topic_id', 'iteration', 'cord_uid', 'relevancy', 'binary_relevance', 'query',
        'query-rewrite', 'query-vector', 'question', 'narrative'
    ]
]
print(len(features))
features
The original labelled data has three types of label: 0, 1 and 2. To simplify, we will consider only two labels here: the document is either relevant (label = 1) or irrelevant (label = 0).
vespa_cord19["binary_relevance"] = vespa_cord19.apply(lambda row: 1 if row["relevancy"] > 0 else 0, axis=1)
vespa_cord19[['relevancy', 'binary_relevance']].head()
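As a quick sanity check (not part of the original notebook), we can look at the resulting class balance:
# Distribution of the binary relevance label
vespa_cord19["binary_relevance"].value_counts()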
We are going to fit logistic regressions with the objective of maximizing the log probability of the observed outcome.
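The score we use to compare feature subsets is the mean realized log-probability of the observed labels under the fitted model:

$$\frac{1}{n}\sum_{i=1}^{n}\log \hat{P}\left(Y_i = y_i \mid x_i\right)$$

where $\hat{P}$ is the probability estimated by the fitted logistic regression and $y_i \in \{0, 1\}$ is the binary relevance label; higher (closer to zero) is better.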
from sklearn.linear_model import LogisticRegression
from statistics import mean


def compute_mean_realize_log_prob(model, X, Y):
    # Mean of the log-probabilities the model assigns to the observed labels
    return mean([x[int(y)] for x, y in zip(model.predict_log_proba(X), Y)])


def fit_logistic_reg(X, Y):
    # Fit an unpenalized logistic regression and return the mean realized
    # log-probability on the training data
    model = LogisticRegression(penalty='none', fit_intercept=True)
    model.fit(X, Y)
    realized_log_prob = compute_mean_realize_log_prob(model, X, Y)
    return realized_log_prob
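A small usage example (not part of the original notebook), scoring a single feature in isolation; here we simply take the first entry of `features`, but any column name works:
# Score one feature by the mean realized log-probability of the observed labels
single_feature = features[0]
score = fit_logistic_reg(vespa_cord19[[single_feature]], vespa_cord19["binary_relevance"])
print(single_feature, score)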
Below we run the best subset selection routine restricted to subsets of size one, i.e., each feature is evaluated in isolation.
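Evaluating each feature on its own is cheap; the number of candidate subsets explodes as the subset size grows, which is presumably why `max_number_features` is capped at 1 below. A quick side calculation with Python's `math.comb` (not part of the original pipeline):
import math

# Number of candidate subsets of each size, out of 163 features
for k in (1, 2, 3):
    print(k, math.comb(163, k))
# 1 163
# 2 13203
# 3 708561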
import itertools
import pandas as pd
from tqdm.notebook import trange  # progress bar over the subset sizes
log_probs, feature_list = [], []
numb_features = []
max_number_features = min(1, len(features))
data = vespa_cord19
Y = data.binary_relevance
X = data[features]
for k in trange(1, max_number_features + 1):
    for combo in itertools.combinations(X.columns, k):
        tmp_result = fit_logistic_reg(X[list(combo)], Y)
        log_probs.append(tmp_result)
        feature_list.append(combo)
        numb_features.append(len(combo))
# Store the results in a DataFrame
df = pd.DataFrame(
    {
        'numb_features': numb_features,
        'log_probs': log_probs,
        'features': feature_list
    }
)
df
df['max_log_probs'] = df.groupby('numb_features')['log_probs'].transform(max)
df
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
plt.scatter(df.numb_features,df.log_probs, alpha = .2, color = 'darkblue')
plt.xlabel('# Features')
plt.ylabel('log_probs')
plt.title('Best subset selection')
plt.plot(df.numb_features,df.max_log_probs, color = 'r', label = 'Best subset')
plt.show()
df_max = df.sort_values('log_probs', ascending=False)
for f in df_max.features:
    print(f)
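The natural next step for this kind of screening would be to keep only the top-ranked features for further experiments, e.g. (a sketch; the cutoff of 20 is arbitrary):
# Keep the 20 individually best-scoring features; each entry of `features` is a
# 1-element tuple because we only evaluated subsets of size one
top_features = [combo[0] for combo in df_max["features"].head(20)]
print(top_features)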
Using the predictive performance of individual features does not seem to be a good approach for eliminating features ahead of a more exhaustive search, such as best subset selection or greedy (stepwise) algorithms. The reason is that many features that perform poorly in isolation shine when combined with other, complementary features.
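To see why, here is a small synthetic illustration (not drawn from the CORD-19 data): two features that look useless on their own but are highly predictive together, because the signal only appears in their difference.
import numpy as np
from sklearn.linear_model import LogisticRegression


def mean_realized_log_prob(X, Y):
    # Same metric as compute_mean_realize_log_prob above, using the default
    # (L2-penalized) solver so it runs on any scikit-learn version
    model = LogisticRegression(fit_intercept=True, max_iter=5000)
    model.fit(X, Y)
    return np.mean([row[int(y)] for row, y in zip(model.predict_log_proba(X), Y)])


rng = np.random.RandomState(0)
n = 5000
y = rng.binomial(1, 0.5, size=n)
shared_noise = rng.normal(scale=10.0, size=n)

x1 = shared_noise + y + rng.normal(scale=0.5, size=n)  # label signal buried in noise
x2 = shared_noise                                      # pure noise, no signal at all

print(mean_realized_log_prob(x1.reshape(-1, 1), y))          # close to log(0.5) = -0.69: weak
print(mean_realized_log_prob(x2.reshape(-1, 1), y))          # close to log(0.5) = -0.69: useless
print(mean_realized_log_prob(np.column_stack([x1, x2]), y))  # much higher: x1 - x2 recovers y
A univariate screen would discard both of these features, yet together they form one of the most informative pairs.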