Sequential feature selection applied to ranking features
- Load data collected from Vespa
- Ranking features available
- Simplify target label
- Define the model used
- Create custom score function
- Data used
- Feature Selector
- Loading the pre-computed results
- Utility functions
- Forward sequential selection
- Forward selection with floating
- Backward sequential selection
- Backward selection with floating
- Summary
The dataset used here was created by collecting ranking features from Vespa associated with the labelled data released in round 3 of the TREC-CORD competition.
# Preview the first two rows of the collected data.
vespa_cord19.head(2)
There are 163 ranking features available.
# Every column is treated as a ranking feature except the identifier, label
# and query-text columns listed here.
features = [
    column
    for column in vespa_cord19.columns
    if column not in (
        'topic_id', 'iteration', 'cord_uid', 'relevancy', 'binary_relevance',
        'query', 'query-rewrite', 'query-vector', 'question', 'narrative',
    )
]
print(len(features))
The original labelled data has three types of labels: 0, 1 and 2. To simplify, we will consider just two labels here: the document is either relevant (label = 1) or irrelevant (label = 0).
# Collapse the graded `relevancy` label into a binary one: any positive grade
# counts as relevant (1), everything else as irrelevant (0). The comparison is
# vectorised over the whole column instead of calling a Python lambda per row.
vespa_cord19["binary_relevance"] = (vespa_cord19["relevancy"] > 0).astype(int)
vespa_cord19[['relevancy', 'binary_relevance']].head()
from sklearn.linear_model import LogisticRegression
# Logistic regression with an intercept and no regularisation penalty.
# NOTE(review): newer scikit-learn releases spell this `penalty=None` instead
# of the string 'none' — confirm against the scikit-learn version in use.
estimator = LogisticRegression(penalty='none', fit_intercept=True)
The function needs to have the following format: `score_func(y, y_pred, **kwargs)`.
from math import log
from sklearn.metrics import make_scorer
def compute_log_prob(y, y_pred, eps=1e-15):
    """Return the Bernoulli log-likelihood of the targets under the predictions.

    Args:
        y: Iterable of binary targets (0 or 1).
        y_pred: Iterable of predicted probabilities of the positive class,
            aligned with ``y``.
        eps: Probabilities are clipped to ``[eps, 1 - eps]`` before taking
            logarithms, so a prediction of exactly 0.0 or 1.0 contributes a
            large finite value instead of raising ``ValueError``.

    Returns:
        The sum over all samples of
        ``target * log(prob) + (1 - target) * log(1 - prob)``. Higher is
        better; an empty input yields 0.
    """
    # Clipping keeps both log() arguments strictly positive.
    clipped = (min(max(prob, eps), 1 - eps) for prob in y_pred)
    return sum(
        target * log(prob) + (1 - target) * log(1 - prob)
        for prob, target in zip(clipped, y)
    )
# Wrap the log-likelihood as a scikit-learn scorer that is evaluated on
# predicted probabilities (needs_proba=True).
scorer = make_scorer(compute_log_prob, needs_proba=True)
# Feature matrix and binary target handed to the feature selector.
X, y = vespa_cord19[features], vespa_cord19["binary_relevance"]
The code below was used to generate the sequential feature selection results that will be analysed in this report. It takes some hours to run on a machine with 32 CPUs.
import os
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# Columns of the per-subset metrics that are persisted to the results file.
columns_to_write = [
    "avg_score", "ci_bound", "cv_scores", "feature_idx",
    "feature_names", "std_dev", "std_err", "forward", "floating",
]
output_file_name = "sequential_feature.csv"
# Fitted selectors, one per (forward, floating) combination. The list has to
# exist before the loop: appending to an undefined name would raise NameError
# only after the first long-running fit has finished.
results = []
for forward in [True, False]:
    for floating in [False, True]:
        sfs = SFS(
            estimator=estimator,
            k_features=(1, len(features)),
            forward=forward,
            floating=floating,
            scoring=scorer,
            cv=4,
            n_jobs=-1,
            verbose=1,
        )
        sfs = sfs.fit(X, y)
        # Transpose the metric dict so each evaluated subset becomes a row,
        # then tag the rows with the search settings that produced them.
        sfs_df = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
        sfs_df["forward"] = forward
        sfs_df["floating"] = floating
        # Append to the shared CSV; the header is written only when the file
        # does not exist yet (append mode creates it in that case).
        sfs_df.to_csv(
            output_file_name,
            mode="a",
            header=not os.path.isfile(output_file_name),
            index=False,
            columns=columns_to_write,
        )
        results.append(sfs)
from pandas import read_csv
# Load the pre-computed selection results from the CSV file and preview them.
data = read_csv("sequential_feature.csv")
data.head(2)
from pandas import DataFrame
def create_dfs(df):
    """Expand stored selection results into per-score and per-size frames.

    Args:
        df: DataFrame with string columns ``cv_scores`` (bracketed,
            whitespace-separated floats), ``feature_idx`` (parenthesised,
            comma-separated indices) and ``feature_names``.

    Returns:
        A tuple ``(sequential_selection, features)``.
        ``sequential_selection`` has one row per cross-validation score with
        columns ``cv_scores``, ``number_features`` and ``feature_names``.
        ``features`` keeps one ``number_features``/``feature_names`` row per
        distinct subset size, sorted by size.
    """
    cv_scores, number_features, feature_names = [], [], []
    for _, row in df.iterrows():
        scores = [
            float(x) for x in row["cv_scores"].lstrip("[").rstrip("]").split()
        ]
        # A one-element tuple is stored as "(3,)". Ignore the empty token
        # after its trailing comma so it counts as one feature, not two.
        indices = [
            token
            for token in row["feature_idx"].lstrip("(").rstrip(")").split(",")
            if token.strip()
        ]
        cv_scores.extend(scores)
        number_features.extend([len(indices)] * len(scores))
        feature_names.extend([row["feature_names"]] * len(scores))
    sequential_selection = DataFrame(
        data={
            "cv_scores": cv_scores,
            "number_features": number_features,
            "feature_names": feature_names,
        }
    )
    # One representative row per subset size.
    subset_names = (
        sequential_selection[["number_features", "feature_names"]]
        .sort_values("number_features")
        .drop_duplicates(["number_features"])
    )
    return sequential_selection, subset_names
def display_feature_names(df, number_features):
    """Return the feature names stored for a given subset size.

    Args:
        df: DataFrame with a ``number_features`` column and a
            ``feature_names`` column holding the string form of a tuple of
            single-quoted names.
        number_features: Subset size to look up.

    Returns:
        List of feature names, without surrounding quotes, taken from the
        first row whose ``number_features`` matches.

    Raises:
        IndexError: If no row has the requested ``number_features``.
    """
    raw_names = df[df.number_features == number_features].iloc[0]["feature_names"]
    # NOTE(review): splitting on "," assumes no feature name contains a comma
    # — confirm against the actual feature list.
    tokens = raw_names.lstrip("(").rstrip(")").split(",")
    names = (token.strip().strip("'") for token in tokens)
    # A one-element tuple is stored as "('name',)"; drop the empty token that
    # follows its trailing comma.
    return [name for name in names if name]
# Plain forward selection: rows written with forward=True and floating=False.
forward_df = data.loc[data["forward"].eq(True) & data["floating"].eq(False)]
forward_sequential_selection, forward_features = create_dfs(forward_df)
import plotly.express as px
# Distribution of cross-validation scores for each subset size.
fig = px.box(
    data_frame=forward_sequential_selection,
    x="number_features",
    y="cv_scores",
)
fig.show()
# Names of the features picked for the 7-feature subset.
display_feature_names(forward_features, 7)
# Forward selection with floating: rows written with forward=True and
# floating=True.
forward_floating_df = data.loc[
    data["forward"].eq(True) & data["floating"].eq(True)
]
forward_floating_sequential_selection, forward_floating_features = create_dfs(
    forward_floating_df
)
import plotly.express as px
# Distribution of cross-validation scores for each subset size.
fig = px.box(
    data_frame=forward_floating_sequential_selection,
    x="number_features",
    y="cv_scores",
)
fig.show()
# Names of the features picked for the 7-feature subset.
display_feature_names(forward_floating_features, 7)
# Plain backward selection: rows written with forward=False and floating=False.
backward_df = data.loc[data["forward"].eq(False) & data["floating"].eq(False)]
backward_sequential_selection, backward_features = create_dfs(backward_df)
import plotly.express as px
# Distribution of cross-validation scores for each subset size.
fig = px.box(
    data_frame=backward_sequential_selection,
    x="number_features",
    y="cv_scores",
)
fig.show()
# Names of the features kept in the 7-feature subset.
display_feature_names(backward_features, 7)
# Backward selection with floating: rows written with forward=False and
# floating=True.
backward_floating_df = data.loc[
    data["forward"].eq(False) & data["floating"].eq(True)
]
backward_floating_sequential_selection, backward_floating_features = create_dfs(
    backward_floating_df
)
import plotly.express as px
# Distribution of cross-validation scores for each subset size.
fig = px.box(
    data_frame=backward_floating_sequential_selection,
    x="number_features",
    y="cv_scores",
)
fig.show()
# Names of the features kept in the 7-feature subset.
display_feature_names(backward_floating_features, 7)
# Side-by-side table of the 7-feature subset chosen by each search strategy,
# one column per strategy.
DataFrame(
    {
        strategy: display_feature_names(selected, 7)
        for strategy, selected in (
            ("forward", forward_features),
            ("forward_floating", forward_floating_features),
            ("backward", backward_features),
            ("backward_floating", backward_floating_features),
        )
    }
)