Набор данных выглядит так:
import pandas as pd

# Toy dataset used throughout the example. The two ``None`` entries in "Age"
# are stored by pandas as NaN, i.e. missing values.
data = {
    "Name": ["Fiona", "Gerald", "Hans", "Isabella", "Jacob"],
    "Age": [20, 34, None, None, 33],
    "Gender": ["f", "m", "m", "f", "m"],
    "Job": ["writer", "Programmer", "Programmer", "Programmer", "Teacher"],
}
df = pd.DataFrame(data)
df  # bare expression: shows the frame when run as a notebook cell
Создадим классы предобработки данных для пайплайна:
- Удаление столбца с именами (NameDropper)
- Заполнение пропусков в возрасте средним значением (AgeImputer)
- Двоичное кодирование пола (FeatureEncoder)
- One-hot-кодирование профессии (FeatureEncoder)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.validation import check_is_fitted


class NameDropper(BaseEstimator, TransformerMixin):
    """Remove the ``Name`` column from a DataFrame."""

    def fit(self, X, y=None):
        """Do nothing; this transformer has no state to learn."""
        return self

    def transform(self, X):
        """Return a new DataFrame without the ``Name`` column.

        Raises:
            KeyError: if ``X`` has no ``Name`` column.
        """
        return X.drop(columns=["Name"])


class AgeImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in the ``Age`` column with the mean age.

    The mean is learned in :meth:`fit` and reused in :meth:`transform`, so
    data transformed later is filled with the statistic of the fitting data
    rather than its own.
    """

    def fit(self, X, y=None):
        """Learn the mean of ``X["Age"]``."""
        self.imputer_ = SimpleImputer(strategy="mean").fit(X[["Age"]])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing ``Age`` values filled in.

        Raises:
            sklearn.exceptions.NotFittedError: if called before :meth:`fit`.
        """
        check_is_fitted(self, "imputer_")
        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()
        X[["Age"]] = self.imputer_.transform(X[["Age"]])
        return X


class FeatureEncoder(BaseEstimator, TransformerMixin):
    """Encode ``Gender`` as 0/1 and one-hot encode ``Job``.

    The set of job categories is learned in :meth:`fit`, so every later call
    to :meth:`transform` produces the same output columns.
    """

    # Mapping applied to the ``Gender`` column.
    GENDER_CODES = {"m": 0, "f": 1}

    def fit(self, X, y=None):
        """Learn the job categories present in ``X["Job"]``."""
        # "ignore" encodes a job unseen during fit as an all-zero row instead
        # of raising at transform time.
        self.encoder_ = OneHotEncoder(handle_unknown="ignore").fit(X[["Job"]])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with encoded ``Gender`` and ``Job``.

        ``Gender`` is replaced by its numeric code. ``Job`` is replaced by one
        indicator column per category learned in :meth:`fit`, each named
        after its category.

        Raises:
            KeyError: if ``Gender`` holds a value other than "m" or "f".
            sklearn.exceptions.NotFittedError: if called before :meth:`fit`.
        """
        check_is_fitted(self, "encoder_")
        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()
        X["Gender"] = [self.GENDER_CODES[g] for g in X["Gender"]]

        indicators = self.encoder_.transform(X[["Job"]]).toarray()
        # Label each indicator column with the category the encoder actually
        # assigned to it; a hard-coded name list can disagree with the
        # encoder's (sorted) category order.
        for category, column in zip(self.encoder_.categories_[0], indicators.T):
            X[category] = column
        return X.drop(columns=["Job"])
Использование конвейера:
from sklearn.pipeline import Pipeline

# Chain the preprocessing steps; each step receives the previous step's output.
pipe = Pipeline(
    [
        ("dropper", NameDropper()),
        ("imputer", AgeImputer()),
        ("encoder", FeatureEncoder()),
    ]
)
# Fit every step on ``df`` and return the fully transformed frame
# (shown as the cell output when run in a notebook).
pipe.fit_transform(df)