Model Trainer Issue in End-to-End ML Project – ValueError: at least one array or dtype is required

I am following the process shown in the Wine Quality Prediction End-to-End ML Project on Krish Naik's YouTube channel to build a Flight Fare Prediction project.

I run this cell of the model trainer pipeline in 04_model_trainer.ipynb:

try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer_config = ModelTrainer(model_trainer_config)
    # model_trainer_config.train()
    model_trainer_config.initiate_model_training()
except Exception as e:
    raise e

I get this error:

ValueError: at least one array or dtype is required

Here is the full traceback:

[2023-12-19 14:59:14,228: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-19 14:59:14,235: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-19 14:59:14,235: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-12-19 14:59:14,235: INFO: common: created directory at: artifacts]
[2023-12-19 14:59:14,235: INFO: common: created directory at: artifacts/model_trainer]
[2023-12-19 14:59:14,467: INFO: 378519018: Splitting ]
[2023-12-19 14:59:14,468: INFO: 378519018: Exception occured during model training]
[2023-12-19 14:59:14,468: INFO: 378519018: Exception occured at model trianing]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[17], line 8
      6     model_trainer_config.initiate_model_training()
      7 except Exception as e:
----> 8     raise e

Cell In[17], line 6
      4     model_trainer_config = ModelTrainer(model_trainer_config)
      5     # model_trainer_config.train()
----> 6     model_trainer_config.initiate_model_training()
      7 except Exception as e:
      8     raise e

Cell In[16], line 130, in ModelTrainer.initiate_model_training(self)
    128 except Exception as e:
    129     logger.info('Exception occured at model trianing')
--> 130     raise e

Cell In[16], line 104, in ModelTrainer.initiate_model_training(self)
     89 logger.info('Splitting ')
     91 models={
     92 'LinearRegression':LinearRegression(),
     93 'Lasso':Lasso(),
   (...)
    101 "KNN" : KNeighborsRegressor()
    102 }
--> 104 model_report:dict = ModelTrainer.evaluate_model(X_train,y_train, X_test, y_test, models)
    105 print(model_report)
    106 print("\n====================================================================================")

Cell In[16], line 73, in ModelTrainer.evaluate_model(X_train, y_train, X_test, y_test, models)
     71 except Exception as e:
     72     logger.info('Exception occured during model training')
---> 73     raise e

Cell In[16], line 59, in ModelTrainer.evaluate_model(X_train, y_train, X_test, y_test, models)
     56 model = list(models.values())[i]
     58 # Train model
---> 59 model.fit(X_train,y_train)
     61 # Predict Testing data
     62 y_test_pred = model.predict(X_test)

File c:\Users\2021\.conda\envs\flightfareprediction\lib\site-packages\sklearn\base.py:1152, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1145     estimator._validate_params()
   1147 with config_context(
   1148     skip_parameter_validation=(
   1149         prefer_skip_nested_validation or global_skip_validation
   1150     )
   1151 ):
-> 1152     return fit_method(estimator, *args, **kwargs)

File c:\Users\2021\.conda\envs\flightfareprediction\lib\site-packages\sklearn\linear_model\_base.py:678, in LinearRegression.fit(self, X, y, sample_weight)
    674 n_jobs_ = self.n_jobs
    676 accept_sparse = False if self.positive else ["csr", "csc", "coo"]
--> 678 X, y = self._validate_data(
    679     X, y, accept_sparse=accept_sparse, y_numeric=True, multi_output=True
    680 )
    682 has_sw = sample_weight is not None
    683 if has_sw:

File c:\Users\2021\.conda\envs\flightfareprediction\lib\site-packages\sklearn\base.py:622, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
    620         y = check_array(y, input_name="y", **check_y_params)
    621     else:
--> 622         X, y = check_X_y(X, y, **check_params)
    623     out = X, y
    625 if not no_val_X and check_params.get("ensure_2d", True):

File c:\Users\2021\.conda\envs\flightfareprediction\lib\site-packages\sklearn\utils\validation.py:1146, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
   1141         estimator_name = _check_estimator_name(estimator)
   1142     raise ValueError(
   1143         f"{estimator_name} requires y to be passed, but the target y is None"
   1144     )
-> 1146 X = check_array(
   1147     X,
   1148     accept_sparse=accept_sparse,
   1149     accept_large_sparse=accept_large_sparse,
   1150     dtype=dtype,
   1151     order=order,
   1152     copy=copy,
   1153     force_all_finite=force_all_finite,
   1154     ensure_2d=ensure_2d,
   1155     allow_nd=allow_nd,
   1156     ensure_min_samples=ensure_min_samples,
   1157     ensure_min_features=ensure_min_features,
   1158     estimator=estimator,
   1159     input_name="X",
   1160 )
   1162 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
   1164 check_consistent_length(X, y)

File c:\Users\2021\.conda\envs\flightfareprediction\lib\site-packages\sklearn\utils\validation.py:795, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    791 pandas_requires_conversion = any(
    792     _pandas_dtype_needs_early_conversion(i) for i in dtypes_orig
    793 )
    794 if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
--> 795     dtype_orig = np.result_type(*dtypes_orig)
    796 elif pandas_requires_conversion and any(d == object for d in dtypes_orig):
    797     # Force object if any of the dtypes is an object
    798     dtype_orig = object

File <__array_function__ internals>:200, in result_type(*args, **kwargs)

ValueError: at least one array or dtype is required
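
From the last frames of the traceback, the exception comes from np.result_type(*dtypes_orig) being called with an empty argument list inside check_array, which as far as I understand happens when the X passed to fit() is a DataFrame with zero columns. Here is a minimal sketch (not my project code, just my attempt to reproduce the error in isolation):

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# np.result_type() with no arguments raises the exact message from the traceback
try:
    np.result_type()
except ValueError as e:
    print(e)  # at least one array or dtype is required

# a DataFrame with rows but no columns reaches the same np.result_type(*[]) call
# inside sklearn's check_array (at least on the sklearn version in my traceback)
empty_X = pd.DataFrame(index=range(5))   # shape (5, 0): no feature columns
y = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
try:
    LinearRegression().fit(empty_X, y)
except ValueError as e:
    print(e)  # at least one array or dtype is required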

Here is the code of the ModelTrainer class:

class ModelTrainer:

    def __init__(self, model_trainer_config):
        self.model_trainer_config = model_trainer_config  
    # def __init__(self):
        # self.model_trainer_config = ModelTrainerConfig()    


    def save_obj(file_path, obj):
        try:

            dir_path = os.path.dirname(file_path)
            os.makedirs(dir_path, exist_ok=True)

            with open(file_path, 'wb') as file_obj:
                joblib.dump(obj, file_obj, compress= ('gzip'))

        except Exception as e:
            logger.info('Error occured in utils save_obj')
            raise e
        

    def evaluate_model(X_train, y_train, X_test, y_test, models):

        try:
            report = {}
            for i in range(len(models)):

                model = list(models.values())[i]

                # Train model
                model.fit(X_train,y_train)

                # Predict Testing data
                y_test_pred = model.predict(X_test)

                # Get R2 scores for train and test data
                test_model_score = r2_score(y_test,y_test_pred)

                report[list(models.keys())[i]] =  test_model_score

            return report

        except Exception as e:
            logger.info('Exception occured during model training')
            raise e    



    def initiate_model_training(self): # removing the required variables to be passed into the function because those variables are created below (assuming they were correctly generated in train() )
        # lines below taken from your commented out train() function
        train_data = pd.read_csv(self.model_trainer_config.train_data_path)
        test_data = pd.read_csv(self.model_trainer_config.test_data_path)

        X_train = train_data.drop([self.model_trainer_config.target_column], axis=1)
        X_test = test_data.drop([self.model_trainer_config.target_column], axis=1)
        y_train = train_data[[self.model_trainer_config.target_column]]
        y_test = test_data[[self.model_trainer_config.target_column]]
        # lines above taken from your commented out train() function

        try:
            logger.info('Splitting ')

            models={
            'LinearRegression':LinearRegression(),
            'Lasso':Lasso(),
            'Ridge':Ridge(),
            'Elasticnet':ElasticNet(),
            'RandomForestRegressor': RandomForestRegressor(),
            'GradientBoostRegressor()' : GradientBoostingRegressor(),
            "AdaBoost" : AdaBoostRegressor(),
            'DecisionTreeRegressor' : DecisionTreeRegressor(),
            "SupportVectorRegressor" : SVR(),
            "KNN" : KNeighborsRegressor()
            }

            model_report:dict = ModelTrainer.evaluate_model(X_train,y_train, X_test, y_test, models)
            print(model_report)
            print("\n====================================================================================")
            logger.info(f'Model Report : {model_report}')

            # to get best model score from dictionary
            best_model_score = max(sorted(model_report.values()))

            best_model_name = list(model_report.keys())[
                list(model_report.values()).index(best_model_score)
            ]

            best_model = models[best_model_name]

            print(f"Best Model Found, Model Name :{best_model_name}, R2-score: {best_model_score}")
            print("\n====================================================================================")
            logger.info(f"Best Model Found, Model name: {best_model_name}, R2-score: {best_model_score}")
            logger.info(f"{best_model.feature_names_in_}")
            
            ModelTrainer.save_obj(
            file_path = self.model_trainer_config.trained_model_file_path,
            obj = best_model
            )

        except Exception as e:
            logger.info('Exception occured at model trianing')
            raise e
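
For debugging, here is a quick sanity check on the CSVs the model trainer reads (a rough sketch outside the pipeline; the local names are only for this check):

import pandas as pd

config = ConfigurationManager()
mtc = config.get_model_trainer_config()

# check that the transformation stage actually produced feature columns
train_data = pd.read_csv(mtc.train_data_path)
print(train_data.shape)        # expect (n_rows, n_features + 1)
print(list(train_data.columns))  # the target column plus the feature columns

X_train = train_data.drop([mtc.target_column], axis=1)
print(X_train.shape)           # if this is (n_rows, 0), fit() fails exactly as above
print(X_train.dtypes)          # object/string columns here would also need encoding before fit()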

Here is the code of the ConfigurationManager class:

class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        params = self.params.ElasticNet
        schema =  self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            train_data_path = config.train_data_path,
            test_data_path = config.test_data_path,
            # # New Line Added
            # trained_model_file_path =  os.path.join('artifact', 'model'),
            # # New Line Ended
            model_name = config.model_name,
            alpha = params.alpha,
            l1_ratio = params.l1_ratio,
            target_column = schema.name
            
        )

        return model_trainer_config

Here is the code of the ModelTrainerConfig class:

from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    alpha: float
    l1_ratio: float
    target_column: int
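
For clarity, this is how I understand the dataclass gets filled in by get_model_trainer_config() above. The values below are placeholders I made up, not my real config; note also that target_column is annotated as int but in practice receives the column name string from schema.TARGET_COLUMN.name:

# illustrative only -- real values come from config.yaml, params.yaml and schema.yaml
example_config = ModelTrainerConfig(
    root_dir=Path("artifacts/model_trainer"),
    train_data_path=Path("artifacts/data_transformation/train.csv"),
    test_data_path=Path("artifacts/data_transformation/test.csv"),
    model_name="model.joblib",
    alpha=0.1,
    l1_ratio=0.5,
    target_column="Price",  # placeholder column name, not my actual target
)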

Here is my file on GitHub.

My file encoding is UTF-8.

How do I fix this issue?

  • There are undefined variables in the example code; the question needs sufficient code for a minimal reproducible example.

  • @D.L I have updated the question with sufficient code.

  • Are X_train and y_train actual values? stackoverflow.com/questions/59403627/….

  • @AndrewRyan Yes, I think so.
