|
import numpy as np |
|
import pandas as pd |
|
from sklearn.model_selection import train_test_split |
|
|
|
def load_soil_data(file_path, target_columns, n_wavelengths=4200,
                   test_size=0.2, random_state=42):
    """Load a soil-spectra CSV and split it into training and test sets.

    The first ``n_wavelengths`` columns of the file are taken as the
    spectral features. Their headers must convert to floats once the
    literal text ``spc.`` has been removed (e.g. ``spc.400`` -> ``400.0``).

    Args:
        file_path: Path to the CSV file containing the soil data.
        target_columns: List of column names of the target soil indicators.
        n_wavelengths: Number of leading columns that hold the spectra.
            Defaults to 4200.
        test_size: Fraction of samples placed in the test set, forwarded to
            ``train_test_split``. Defaults to 0.2 (an 8:2 split).
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 42.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, wavelengths)``:

        - ``X_train`` / ``X_test``: float32 arrays of shape
          ``(n_samples, 1, n_wavelengths)``.
        - ``y_train`` / ``y_test``: arrays holding the ``target_columns``
          values for the corresponding samples.
        - ``wavelengths``: pandas Index of floats parsed from the spectral
          column headers.

    Raises:
        ValueError: If the file has fewer than ``n_wavelengths`` columns, or
            if a spectral column header cannot be converted to a float.
    """
    data = pd.read_csv(file_path)

    # Fail early with a clear message instead of an obscure conversion or
    # reshape error further down.
    if data.shape[1] < n_wavelengths:
        raise ValueError(
            f"expected at least {n_wavelengths} spectral columns, "
            f"but the file only has {data.shape[1]} columns"
        )

    spectra = data.iloc[:, :n_wavelengths]

    # regex=False makes 'spc.' a literal match on every pandas version; the
    # default for `regex` changed in pandas 2.0, and as a regular expression
    # the '.' would match any character.
    wavelengths = spectra.columns.str.replace('spc.', '', regex=False).astype(float)

    X = spectra.values.astype('float32')
    y = data[target_columns].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Insert a singleton middle axis: (n_samples, 1, n_wavelengths).
    X_train = X_train.reshape(-1, 1, n_wavelengths)
    X_test = X_test.reshape(-1, 1, n_wavelengths)

    return X_train, X_test, y_train, y_test, wavelengths
|
|
|
if __name__ == "__main__":
    # Input CSV and the eight target columns to extract from it.
    file_path = 'LUCAS.2009_abs.csv'
    target_columns = [
        'pH.in.CaCl2', 'pH.in.H2O', 'OC', 'CaCO3',
        'N', 'P', 'K', 'CEC',
    ]

    (X_train, X_test,
     y_train, y_test,
     wavelengths) = load_soil_data(file_path, target_columns)

    # Report the shape of every returned object, in return order.
    report = (
        ("X_train shape:", X_train),
        ("X_test shape:", X_test),
        ("y_train shape:", y_train),
        ("y_test shape:", y_test),
        ("wavelengths shape:", wavelengths),
    )
    for label, values in report:
        print(label, values.shape)
|
|