
Набор данных выглядит так:
import pandas as pd
# Toy dataset: five people, two of whom have no recorded age (None),
# plus a categorical "Job" column.
data = dict(
    Name=["Fiona", "Gerald", "Hans", "Isabella", "Jacob"],
    Age=[20, 34, None, None, 33],
    Gender=["f", "m", "m", "f", "m"],
    Job=["writer", "Programmer", "Programmer", "Programmer", "Teacher"],
)
df = pd.DataFrame(data)
df  # notebook-style display of the resulting frame

Создадим классы предобработки данных для пайплайна:
- Удаление столбца с именами (NameDropper)
- Заполнение пропущенных значений возраста (AgeImputer)
- Двоичное кодирование пола (FeatureEncoder)
- One-hot-кодирование профессии (FeatureEncoder)
from sklearn.base import BaseEstimator, TransformerMixin
class NameDropper(BaseEstimator, TransformerMixin):
    """Pipeline step that removes the ``Name`` column from a DataFrame."""

    def fit(self, X, y=None):
        """Return ``self`` unchanged; this step learns nothing from the data."""
        return self

    def transform(self, X):
        """Return the result of dropping the ``Name`` column from ``X``."""
        return X.drop(columns=["Name"])
class AgeImputer(BaseEstimator, TransformerMixin):
    """Pipeline step that fills missing ``Age`` values with the mean age.

    The mean is learned in :meth:`fit` and reused by :meth:`transform`, so data
    transformed later is imputed with the statistic of the data the step was
    fitted on. The input frame is never modified.
    """

    @staticmethod
    def _fit_imputer(X):
        """Return a mean-strategy ``SimpleImputer`` fitted on ``X[["Age"]]``."""
        # Imported here so the class does not depend on a module-level import.
        from sklearn.impute import SimpleImputer

        return SimpleImputer(strategy="mean").fit(X[["Age"]])

    def fit(self, X, y=None):
        """Learn the mean of the ``Age`` column of ``X``.

        Args:
            X: DataFrame containing an ``Age`` column.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            This transformer, with the fitted imputer stored in ``imputer_``.
        """
        self.imputer_ = self._fit_imputer(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose missing ``Age`` values are filled in.

        Args:
            X: DataFrame containing an ``Age`` column.

        Returns:
            A new DataFrame; ``X`` itself is left untouched.
        """
        imputer = getattr(self, "imputer_", None)
        if imputer is None:
            # Keep the earlier stateless behaviour working: an instance that
            # was never fitted imputes with the mean of the frame it is given.
            imputer = self._fit_imputer(X)

        result = X.copy()
        result["Age"] = imputer.transform(result[["Age"]])
        return result
class FeatureEncoder(BaseEstimator, TransformerMixin):
    """Pipeline step that encodes the ``Gender`` and ``Job`` columns.

    ``Gender`` is mapped to 0 (``"m"``) / 1 (``"f"``). ``Job`` is one-hot
    encoded into one column per category, and the original ``Job`` column is
    dropped. The job categories are learned in :meth:`fit`; the input frame is
    never modified.
    """

    _GENDER_CODES = {"m": 0, "f": 1}

    @staticmethod
    def _fit_encoder(X):
        """Return a ``OneHotEncoder`` fitted on ``X[["Job"]]``."""
        # Imported here so the class does not depend on a module-level import.
        from sklearn.preprocessing import OneHotEncoder

        # Jobs unseen during fit become all-zero rows instead of raising.
        return OneHotEncoder(handle_unknown="ignore").fit(X[["Job"]])

    def fit(self, X, y=None):
        """Learn the set of job categories present in ``X``.

        Args:
            X: DataFrame containing a ``Job`` column.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            This transformer, with the fitted encoder stored in ``encoder_``.
        """
        self.encoder_ = self._fit_encoder(X)
        return self

    def transform(self, X):
        """Return an encoded copy of ``X``.

        Args:
            X: DataFrame containing ``Gender`` and ``Job`` columns.

        Returns:
            A new DataFrame with ``Gender`` replaced by its numeric code, one
            added 0/1 column per job category, and ``Job`` removed.

        Raises:
            KeyError: If ``Gender`` holds a value other than ``"m"`` or ``"f"``.
        """
        encoder = getattr(self, "encoder_", None)
        if encoder is None:
            # Keep the earlier stateless behaviour working: an instance that
            # was never fitted derives the categories from the frame it is given.
            encoder = self._fit_encoder(X)

        result = X.copy()
        result["Gender"] = [self._GENDER_CODES[g] for g in result["Gender"]]

        matrix = encoder.transform(result[["Job"]]).toarray()
        # Take the column labels from the encoder itself: it orders categories
        # by sorting them, so a hand-written name list can mislabel columns.
        for category, column in zip(encoder.categories_[0], matrix.T):
            result[category] = column
        return result.drop(columns=["Job"])
Использование конвейера:
from sklearn.pipeline import Pipeline
# Chain the preprocessing steps in order: drop names, impute ages, encode
# the categorical features; then run the whole chain on the sample frame.
pipe = Pipeline(
    steps=[
        ("dropper", NameDropper()),
        ("imputer", AgeImputer()),
        ("encoder", FeatureEncoder()),
    ]
)
pipe.fit_transform(df)
