DataLoader¶
DataLoader adalah utility di PyTorch untuk memuat data dalam batch, melakukan shuffling, dan parallel loading.
Dataset Class¶
Custom Dataset¶
import torch
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
    """Map-style dataset that wraps raw feature/label arrays as tensors."""

    def __init__(self, X, y):
        # Convert once at construction: float32 features, int64 class labels.
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
# Example usage with synthetic data.
import numpy as np

X = np.random.randn(100, 10)
y = np.random.randint(0, 2, 100)

dataset = CustomDataset(X, y)
print(f"Dataset size: {len(dataset)}")
print(f"Sample: {dataset[0]}")
TensorDataset¶
import torch
from torch.utils.data import TensorDataset, DataLoader

# TensorDataset pairs pre-built tensors without needing a custom class.
X = torch.randn(100, 10)
y = torch.randint(0, 2, (100,))

dataset = TensorDataset(X, y)
print(f"Dataset size: {len(dataset)}")
DataLoader¶
Basic Usage¶
from torch.utils.data import DataLoader

dataloader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
)

# Iterate over batches; stop after inspecting the first one.
for batch_X, batch_y in dataloader:
    print(f"Batch X shape: {batch_X.shape}")
    print(f"Batch y shape: {batch_y.shape}")
    break
Parameter Penting¶
from torch.utils.data import DataLoader

dataloader = DataLoader(
    dataset,
    batch_size=32,    # samples per batch
    shuffle=True,     # reshuffle order every epoch
    num_workers=4,    # subprocesses for parallel loading
    drop_last=True,   # discard the last batch if it is incomplete
    pin_memory=True,  # faster host-to-GPU transfers
)
Split Train/Val/Test¶
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split

# Build a synthetic dataset.
X = torch.randn(1000, 10)
y = torch.randint(0, 3, (1000,))
dataset = TensorDataset(X, y)

# 80/10/10 split; the test share absorbs any rounding remainder.
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size]
)

# One loader per split; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"Train: {len(train_dataset)}")
print(f"Val: {len(val_dataset)}")
print(f"Test: {len(test_dataset)}")
Custom Collate Function¶
import torch
from torch.utils.data import DataLoader
def custom_collate(batch):
    """Merge a list of (features, label) samples into one batch.

    Features are stacked into a single tensor; labels become a 1-D tensor.
    """
    features = torch.stack([sample[0] for sample in batch])
    labels = torch.tensor([sample[1] for sample in batch])
    return features, labels
# Plug the custom merge function into the loader via collate_fn.
dataloader = DataLoader(
    dataset,
    batch_size=16,
    collate_fn=custom_collate,
)
Dataset dengan Transform¶
import torch
from torch.utils.data import Dataset
import numpy as np
class TransformDataset(Dataset):
    """Dataset that applies an optional transform to each sample's features."""

    def __init__(self, X, y, transform=None):
        self.X = X
        self.y = y
        self.transform = transform  # callable applied to features, or None

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        features, label = self.X[idx], self.y[idx]
        if self.transform:
            features = self.transform(features)
        return features, label
# Transform: standardize a sample to zero mean and unit variance.
def normalize(x):
    centered = x - x.mean()
    return centered / x.std()
# Build a dataset that normalizes each sample on access.
dataset = TransformDataset(
    np.random.randn(100, 10),
    np.random.randint(0, 2, 100),
    transform=normalize,
)
Dataset untuk Gambar (dengan torchvision)¶
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Pipeline: PIL image -> tensor in [0, 1] -> rescaled to [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# MNIST train/test splits (downloaded on first use).
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)

# Loaders: shuffle only the training data.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Inspect a single batch.
images, labels = next(iter(train_loader))
print(f"Images shape: {images.shape}")  # torch.Size([64, 1, 28, 28])
print(f"Labels shape: {labels.shape}")  # torch.Size([64])
Iterasi dengan Progress¶
from torch.utils.data import DataLoader
from tqdm import tqdm

train_loader = DataLoader(dataset, batch_size=16, shuffle=True)

for epoch in range(3):
    # tqdm wraps the loader to show a per-epoch progress bar.
    for batch_X, batch_y in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        # Training step
        pass
Weighted Sampling¶
Untuk dataset yang tidak seimbang:
import torch
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler

# Imbalanced dataset: 90% class 0, 10% class 1.
X = torch.randn(1000, 10)
y = torch.tensor([0]*900 + [1]*100)  # 90% class 0, 10% class 1
dataset = TensorDataset(X, y)

# Per-sample weight = inverse frequency of the sample's class.
class_counts = torch.bincount(y)
class_weights = 1.0 / class_counts.float()
sample_weights = class_weights[y]

# Draw with replacement so minority-class samples can repeat within an epoch.
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(dataset),
    replacement=True,
)

# sampler and shuffle are mutually exclusive; pass only the sampler.
dataloader = DataLoader(dataset, batch_size=32, sampler=sampler)

# Inspect the class distribution inside one batch.
batch_y = next(iter(dataloader))[1]
print(f"Class distribution in batch: {torch.bincount(batch_y)}")
Multiple Workers¶
import torch
from torch.utils.data import DataLoader

# Load batches in parallel with worker subprocesses.
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,            # 4 worker processes
    pin_memory=True,          # faster transfer to the GPU
    persistent_workers=True,  # keep workers alive between epochs
)
Peringatan
Di Windows, jika menggunakan num_workers > 0, pastikan kode yang membuat dan mengiterasi DataLoader berada di dalam blok if __name__ == '__main__': untuk menghindari error multiprocessing saat worker process di-spawn.
Contoh Lengkap¶
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
class SimpleDataset(Dataset):
    """Synthetic binary-classification dataset.

    The label of a sample is 1 when its feature row sums to a positive value.
    """

    def __init__(self, size=1000):
        self.X = torch.randn(size, 10)
        self.y = (self.X.sum(dim=1) > 0).long()

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
# Build the dataset and split it 80/20.
dataset = SimpleDataset(1000)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Loaders: shuffle training data only.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Minimal training loop: linear classifier + cross-entropy + Adam.
model = nn.Linear(10, 2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(5):
    model.train()
    total_loss = 0
    for X, y in train_loader:
        optimizer.zero_grad()
        output = model(X)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")
Latihan¶
Buat custom Dataset untuk membaca data dari file CSV
Implementasikan data augmentation dalam Dataset
Buat DataLoader dengan WeightedRandomSampler
Bandingkan kecepatan loading dengan berbagai num_workers