import warnings
warnings.filterwarnings('ignore')

!pip install -q seaborn visualkeras
import visualkeras
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import plotly.graph_objects as go
from PIL import Image
import cv2
import os

import tensorflow as tf
from keras import layers
from keras.layers import Input, Lambda, Dense, Flatten, Dropout, BatchNormalization
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19
from keras.applications.resnet50 import ResNet50
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


def show_first_images(directory_path, n_rows=4, n_cols=5, figsize=(15, 12)):
    """Show the first ``n_rows * n_cols`` images of a folder in a grid.

    Files are filtered by extension (.jpg, .jpeg, .png, case-insensitive) and
    taken in sorted name order, so the preview is the same on every run.

    Args:
        directory_path: Folder whose images are previewed.
        n_rows: Number of grid rows.
        n_cols: Number of grid columns.
        figsize: Matplotlib figure size in inches.
    """
    image_files = sorted(
        file for file in os.listdir(directory_path)
        if file.lower().endswith(('.jpg', '.jpeg', '.png'))
    )[:n_rows * n_cols]

    # squeeze=False keeps `axes` two-dimensional even for a 1-row/1-column grid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)

    # Hide every axis up front so cells left empty (folder with fewer images
    # than grid cells) do not show stray ticks and frames.
    for ax in axes.flat:
        ax.axis('off')

    for ax, image_file in zip(axes.flat, image_files):
        # The context manager releases the file handle once the image is drawn.
        with Image.open(os.path.join(directory_path, image_file)) as img:
            ax.imshow(img)
    plt.show()


# Preview two of the original dataset's classes.
show_first_images("/kaggle/input/d/asdasdasasdas/garbage-classification/Garbage classification/Garbage classification/trash")
show_first_images("/kaggle/input/d/asdasdasasdas/garbage-classification/Garbage classification/Garbage classification/glass")


# Index every file of the bonus dataset: one row per file, labelled with the
# name of the top-level folder it was found under.
root = '/kaggle/input/garbage-classification/garbage_classification/'
path_to_class = {
    f"{root}{class_dir}/{file_name}": class_dir
    for class_dir in os.listdir(root)
    for _, _, file_names in os.walk(root + class_dir)
    for file_name in file_names
}
data = pd.DataFrame(list(path_to_class.items()), columns=['path', 'class_'])

# Bar chart of the number of files per class.
class_counts = data['class_'].value_counts()
class_counts.plot(kind='bar')
plt.title('Data 12 Class (VERY UNBALANCED)')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()


# Count the rows of `data` whose class_ is 'white-glass'.
data[data['class_'] == 'white-glass']['class_'].value_counts()

class_
white-glass    775
Name: count, dtype: int64


import os
import shutil

original_path = '/kaggle/input/d/asdasdasasdas/garbage-classification/Garbage classification/Garbage classification/'
bonus_data_path = '/kaggle/input/garbage-classification/garbage_classification/'
destination_path = '/kaggle/working/modified_dataset/'

# Rebuild the destination from scratch so a rerun never mixes old and new files.
if os.path.isdir(destination_path):
    shutil.rmtree(destination_path)
os.makedirs(destination_path, exist_ok=True)

classes_to_remove = ['cardboard', 'trash', 'glass']  # dropped from the original data
new_classes = ['battery', 'biological', 'white-glass']  # taken from the bonus data
# Destination folder names for the bonus classes that are renamed on copy.
bonus_class_renames = {'biological': 'organic', 'white-glass': 'glass'}
target_image_count = 775

# Step 1: copy the kept classes of the original dataset, then top each one up
# from the bonus class of the same name until it holds target_image_count files.
# Directory listings are sorted because os.listdir returns entries in arbitrary
# order; without sorting, the selected subset could differ between runs.
for class_name in sorted(os.listdir(original_path)):
    class_path = os.path.join(original_path, class_name)
    if class_name in classes_to_remove or not os.path.isdir(class_path):
        continue

    new_class_path = os.path.join(destination_path, class_name)
    os.makedirs(new_class_path, exist_ok=True)
    for file_name in sorted(os.listdir(class_path))[:target_image_count]:
        shutil.copy(os.path.join(class_path, file_name), new_class_path)

    # Names already present in the destination; used to avoid overwriting.
    copied_names = set(os.listdir(new_class_path))
    remaining_images = target_image_count - len(copied_names)
    if remaining_images <= 0:
        continue

    print(f"{class_name} has {len(copied_names)} images and missing {remaining_images} to fill {target_image_count}.")
    bonus_class_path = os.path.join(bonus_data_path, class_name)
    for file_name in sorted(os.listdir(bonus_class_path)):
        if remaining_images == 0:
            break
        # On a name clash, rename to "<class>_<n><ext>" using the first free n.
        extension = os.path.splitext(file_name)[1]
        new_file_name = file_name
        counter = 1
        while new_file_name in copied_names:
            new_file_name = f"{class_name}_{counter}{extension}"
            counter += 1
        shutil.copy(
            os.path.join(bonus_class_path, file_name),
            os.path.join(new_class_path, new_file_name),
        )
        copied_names.add(new_file_name)
        remaining_images -= 1

# Step 2: copy the classes that come only from the bonus dataset
# (battery, biological -> organic, white-glass -> glass).
for class_name in sorted(os.listdir(bonus_data_path)):
    if class_name not in new_classes:
        continue
    class_path = os.path.join(bonus_data_path, class_name)
    new_class_path = os.path.join(
        destination_path, bonus_class_renames.get(class_name, class_name)
    )
    os.makedirs(new_class_path, exist_ok=True)
    for file_name in sorted(os.listdir(class_path))[:target_image_count]:
        shutil.copy(os.path.join(class_path, file_name), new_class_path)

print("\nFINISH: Dataset modification complete.")
for class_ in sorted(os.listdir(destination_path)):
    count_class = len(os.listdir(os.path.join(destination_path, class_)))
    print(f"{class_} has {count_class} images.")

metal has 410 images and missing 365 to fill 775.
paper has 594 images and missing 181 to fill 775.
plastic has 482 images and missing 293 to fill 775.

FINISH: Dataset modification complete.
battery has 775 images.
glass has 775 images.
paper has 775 images.
metal has 775 images.
plastic has 775 images.
organic has 775 images.


# Root folder of the dataset used from here on. The entries returned by
# os.listdir are kept as the list of class names.
data_path = '/kaggle/input/garbage-classification-6-classes-775class/'
data_classes = os.listdir(data_path)
data_classes

['metal', 'glass', 'organic', 'paper', 'battery', 'plastic']


# Report how many entries each class folder under data_path contains.
for class_ in os.listdir(data_path):
    count_class = len(os.listdir(data_path + class_))
    print(f"{class_} has {count_class} images.")

metal has 775 images.
glass has 775 images.
organic has 775 images.
paper has 775 images.
battery has 775 images.
plastic has 775 images.


# Build the (path, type_trash) table in one pass: collect every record first and
# create the DataFrame once. Growing a frame row by row with .loc and
# concatenating per class copies data repeatedly and first types the 'path'
# column as NaN/float before strings are written into it.
records = [
    {'path': data_path + class_ + "/" + photo, 'type_trash': class_}
    for class_ in os.listdir(data_path)
    for photo in os.listdir(data_path + class_)
]
# Passing `columns` keeps both columns present even when no file is found.
data = pd.DataFrame(records, columns=['path', 'type_trash'])
data.shape

(4650, 2)


# Shuffle all rows (frac=1 keeps every row) and renumber the index from 0.
data = data.sample(frac=1).reset_index(drop=True)
data.head(3)


def display_images_with_labels(df, idx):
    """Show the image stored at positional row ``idx`` of ``df`` with its label.

    The row's 'path' column is read as the image file and its 'type_trash'
    column is used in the plot title.
    """
    record = df.iloc[idx]
    image_path, image_label = record['path'], record['type_trash']
    plt.imshow(mpimg.imread(image_path))
    plt.title(f'Label: {image_label}')
    plt.axis('off')
    plt.show()


display_images_with_labels(data, 1040)


from keras.applications.vgg16 import preprocess_input

def image_preprocessing(img_path):
    """Load an image file and prepare it as a single-image batch.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis, and passed through ``preprocess_input``.
    """
    loaded = image.load_img(img_path, target_size=(224, 224))
    batch = np.expand_dims(image.img_to_array(loaded), axis=0)
    return preprocess_input(batch)


def plot_images(original, preprocessed):
    """Draw the original image and its preprocessed version side by side.

    ``preprocessed`` must carry a leading axis of length 1, which is removed
    before display.
    """
    fig, (original_ax, processed_ax) = plt.subplots(1, 2, figsize=(10, 5))

    original_ax.imshow(original)
    original_ax.set_title('Original Image')
    original_ax.axis('off')

    processed_ax.imshow(np.squeeze(preprocessed, axis=0))
    processed_ax.set_title('Preprocessed Image')
    processed_ax.axis('off')

    plt.show()

# Demo: run one image through image_preprocessing and show it next to the
# untouched original.
img_path = '/kaggle/input/garbage-classification-6-classes-775class/plastic/plastic125.jpg' 
original_image = Image.open(img_path)
preprocessed_image = image_preprocessing(img_path)
plot_images(original_image, preprocessed_image)


# Batch size used by the data generators and number of output classes.
BATCH_SIZE = 64
n_classes = 6

# VGG16 base model: ImageNet weights, without its original classifier
# (include_top=False), for 224x224 RGB input.
conv_base = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3)
)

# Freeze all layers except the last two (conv_base.layers[-2:] stay trainable).
for layer in conv_base.layers[:-2]:
    layer.trainable = False

# New classification head stacked on the convolutional base:
# Flatten -> Dense(1024) -> BatchNorm -> Dropout -> Dense(512) -> BatchNorm
# -> Dropout -> softmax over n_classes.
top_model = conv_base.output
top_model = Flatten(name="flatten")(top_model)
top_model = Dense(1024, activation='relu')(top_model)  # Increased units
top_model = BatchNormalization()(top_model)
top_model = Dropout(0.5)(top_model)
top_model = Dense(512, activation='relu')(top_model)  # Increased units
top_model = BatchNormalization()(top_model)
top_model = Dropout(0.5)(top_model)
output_layer = Dense(n_classes, activation='softmax')(top_model)

# Final Model: VGG16 input through to the new softmax output.
model = Model(inputs=conv_base.input, outputs=output_layer)
# Render the architecture diagram and save it to model_architecture.png.
visualkeras.layered_view(model, to_file='model_architecture.png', legend=True)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
58889256/58889256 [==============================] - 0s 0us/step


# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0         
                                                                 
 block3_conv1 (Conv2D)       (None, 56, 56, 256)       295168    
                                                                 
 block3_conv2 (Conv2D)       (None, 56, 56, 256)       590080    
                                                                 
 block3_conv3 (Conv2D)       (None, 56, 56, 256)       590080    
                                                                 
 block3_pool (MaxPooling2D)  (None, 28, 28, 256)       0         
                                                                 
 block4_conv1 (Conv2D)       (None, 28, 28, 512)       1180160   
                                                                 
 block4_conv2 (Conv2D)       (None, 28, 28, 512)       2359808   
                                                                 
 block4_conv3 (Conv2D)       (None, 28, 28, 512)       2359808   
                                                                 
 block4_pool (MaxPooling2D)  (None, 14, 14, 512)       0         
                                                                 
 block5_conv1 (Conv2D)       (None, 14, 14, 512)       2359808   
                                                                 
 block5_conv2 (Conv2D)       (None, 14, 14, 512)       2359808   
                                                                 
 block5_conv3 (Conv2D)       (None, 14, 14, 512)       2359808   
                                                                 
 block5_pool (MaxPooling2D)  (None, 7, 7, 512)         0         
                                                                 
 flatten (Flatten)           (None, 25088)             0         
                                                                 
 dense (Dense)               (None, 1024)              25691136  
                                                                 
 batch_normalization (Batch  (None, 1024)              4096      
 Normalization)                                                  
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 dense_1 (Dense)             (None, 512)               524800    
                                                                 
 batch_normalization_1 (Bat  (None, 512)               2048      
 chNormalization)                                                
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_2 (Dense)             (None, 6)                 3078      
                                                                 
=================================================================
Total params: 40939846 (156.17 MB)
Trainable params: 28581894 (109.03 MB)
Non-trainable params: 12357952 (47.14 MB)
_________________________________________________________________


def plot_random_images(generator, num_images=20):
    """Plot randomly chosen images from the next batch of ``generator``.

    Args:
        generator: Iterator whose ``next()`` yields ``(images, labels)``;
            ``labels`` rows are reduced with ``np.argmax`` to a class index.
        num_images: Number of images to show. Capped at the batch size, since
            sampling is done without replacement.

    The grid is five columns wide with as many rows as needed, so any
    ``num_images`` works; the default of 20 gives the 4x5 layout.
    """
    # Get a batch of data from the generator
    images, labels = next(generator)
    num_images = min(num_images, images.shape[0])
    random_indices = np.random.choice(images.shape[0], num_images, replace=False)

    n_cols = 5
    n_rows = max(1, -(-num_images // n_cols))  # ceiling division
    # squeeze=False keeps `axs` two-dimensional even when there is one row.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows), squeeze=False)
    fig.suptitle(f'Random {num_images} Images from the Generator', fontsize=16)

    for i, ax in enumerate(axs.flatten()):
        ax.axis('off')
        if i >= num_images:
            # Grid cell with no image to show: leave it blank.
            continue
        index = random_indices[i]
        ax.imshow(images[index])
        ax.set_title(f'Class: {np.argmax(labels[index])}')

    plt.show()


# Generator that applies the VGG16 preprocessing function to every image.
gen_train = ImageDataGenerator(preprocessing_function=preprocess_input)

# Iterate over the whole dataset folder in 224x224 batches with one-hot labels.
full_data = gen_train.flow_from_directory(
    data_path,
    target_size=(224, 224),
    batch_size=BATCH_SIZE,
    class_mode="categorical",
)
plot_random_images(full_data)

Found 4650 images belonging to 6 classes.


# File names and integer labels as recorded by the directory iterator; the
# integer labels are mapped back to class-name strings via class_indices.
filenames = full_data.filenames
labels = full_data.labels
class_mapping = {value: str(key) for key, value in full_data.class_indices.items()}
labels = [class_mapping[label] for label in labels]

# Split the data into training and testing sets
# (90% / 10%; random_state fixes the split so it is the same on every run).
# NOTE(review): the split is not stratified, and the saved models evaluated
# later are presumably tied to exactly this split — confirm before changing it.
train_filenames, test_filenames, train_labels, test_labels = train_test_split(filenames, labels, test_size=0.1, random_state=42)

train_df = pd.DataFrame({'filename': train_filenames, 'class': train_labels})
test_df = pd.DataFrame({'filename': test_filenames, 'class': test_labels})

# Create separate generators for training and testing using flow_from_dataframe
# (file names in the frames are resolved relative to data_path).
train_data = gen_train.flow_from_dataframe(train_df, directory=data_path, target_size=(224, 224),
                                           batch_size=BATCH_SIZE, class_mode="categorical",
                                           shuffle=True, seed=42)

# shuffle=False keeps the test batches in frame order, so test_data.classes
# lines up row by row with the output of predict().
test_data = gen_train.flow_from_dataframe(test_df, directory=data_path, target_size=(224, 224),
                                          batch_size=BATCH_SIZE, class_mode="categorical",
                                          shuffle=False)

Found 4185 validated image filenames belonging to 6 classes.
Found 465 validated image filenames belonging to 6 classes.


# Load the saved VGG16-based classifier from its .h5 file.
vgg16 = load_model("/kaggle/input/garbage-classifier-model/garbage_classifier.vgg_16.h5")


# Class names indexed by predicted class id; predict_user uses this list to
# turn an argmax index into a label.
# NOTE(review): the order presumably has to match the generators'
# class_indices — confirm.
output_class = ["battery", "glass", "metal","organic", "paper", "plastic"]

def preprocessing_input(img_path):
    """Turn an image file into a preprocessed single-image batch.

    Loads the file at 224x224, converts it to an array, adds a leading batch
    axis and applies the VGG16 ``preprocess_input``.
    """
    picture = image.load_img(img_path, target_size=(224, 224))
    as_batch = np.expand_dims(image.img_to_array(picture), axis=0)
    return preprocess_input(as_batch)  # VGG16 preprocess_input

def plot_images(original, preprocessed):
    """Show an image and its preprocessed counterpart in two adjacent panels.

    ``preprocessed`` is expected to have a leading batch axis of length 1.
    """
    fig, (left_panel, right_panel) = plt.subplots(1, 2, figsize=(10, 5))

    left_panel.imshow(original)
    left_panel.set_title('Original Image')
    left_panel.axis('off')

    # Drop the batch axis so the array can be displayed as an image.
    single_image = np.squeeze(preprocessed, axis=0)

    right_panel.imshow(single_image)
    right_panel.set_title('Preprocessed Image')
    right_panel.axis('off')

    plt.show()

def predict_user(img_path, classifier=None):
    """Classify one image file and print the predicted waste class.

    Args:
        img_path: Path of the image to classify.
        classifier: Model exposing ``predict``; defaults to the loaded
            ``vgg16`` model, which matches the VGG16 preprocessing applied by
            ``preprocessing_input``.

    The global name ``model`` is deliberately not used here: other cells of
    this notebook rebind it as a loop variable (eventually to a plain string),
    which would make the prediction depend on cell execution order or fail.
    """
    if classifier is None:
        classifier = vgg16

    img = preprocessing_input(img_path)
    # Close the file handle as soon as the comparison plot has been drawn.
    with Image.open(img_path) as original:
        plot_images(original, img)

    predicted_array = classifier.predict(img)
    predicted_value = output_class[int(np.argmax(predicted_array))]
    # The highest softmax output is the model's confidence in this single
    # prediction, not an accuracy. Converting to float first gives a cleanly
    # rounded Python number.
    predicted_confidence = round(float(np.max(predicted_array)) * 100, 2)
    print("Your waste material is", predicted_value, "with", predicted_confidence, "% confidence.")


# --- VGG16 on the held-out test generator ---
class_names = list(test_data.class_indices.keys())

# Predicted class = index of the highest score per row; true class ids come
# straight from the generator.
predictions = vgg16.predict(test_data)
predicted_labels = np.argmax(predictions, axis=1)
true_labels = test_data.classes

conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Confusion matrix heatmap (rows: true class, columns: predicted class).
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Per-class precision / recall / F1.
print(classification_report(true_labels, predicted_labels, target_names=class_names))

8/8 [==============================] - 20s 751ms/step

              precision    recall  f1-score   support

     battery       0.98      1.00      0.99        87
       glass       0.95      0.86      0.91        73
       metal       0.92      0.88      0.90        69
     organic       0.95      0.98      0.97        64
       paper       1.00      0.98      0.99        92
     plastic       0.85      0.94      0.89        80

    accuracy                           0.94       465
   macro avg       0.94      0.94      0.94       465
weighted avg       0.95      0.94      0.94       465


# --- VGG19: load the saved model and score it on the test generator ---
vgg19 = load_model("/kaggle/input/garbage-classifier-model/garbage_classifier.vgg_19.h5")

class_names = list(test_data.class_indices.keys())

# Predicted class = index of the highest score per row; true class ids come
# straight from the generator.
predictions = vgg19.predict(test_data)
predicted_labels = np.argmax(predictions, axis=1)
true_labels = test_data.classes

conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Confusion matrix heatmap (rows: true class, columns: predicted class).
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Per-class precision / recall / F1.
print(classification_report(true_labels, predicted_labels, target_names=class_names))

8/8 [==============================] - 2s 286ms/step

              precision    recall  f1-score   support

     battery       0.95      0.99      0.97        87
       glass       0.90      0.90      0.90        73
       metal       0.90      0.88      0.89        69
     organic       0.95      0.97      0.96        64
       paper       0.94      0.98      0.96        92
     plastic       0.94      0.85      0.89        80

    accuracy                           0.93       465
   macro avg       0.93      0.93      0.93       465
weighted avg       0.93      0.93      0.93       465


# --- ResNet50: load the saved model and score it on the test generator ---
resnet50 = load_model("/kaggle/input/garbage-classifier-model/garbage_classifier.resnet50.h5")

class_names = list(test_data.class_indices.keys())

# Predicted class = index of the highest score per row; true class ids come
# straight from the generator.
predictions = resnet50.predict(test_data)
predicted_labels = np.argmax(predictions, axis=1)
true_labels = test_data.classes

conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Confusion matrix heatmap (rows: true class, columns: predicted class).
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Per-class precision / recall / F1.
print(classification_report(true_labels, predicted_labels, target_names=class_names))

8/8 [==============================] - 5s 276ms/step

              precision    recall  f1-score   support

     battery       0.98      0.99      0.98        87
       glass       0.87      0.93      0.90        73
       metal       0.86      0.87      0.86        69
     organic       1.00      0.97      0.98        64
       paper       0.95      0.96      0.95        92
     plastic       0.92      0.85      0.88        80

    accuracy                           0.93       465
   macro avg       0.93      0.93      0.93       465
weighted avg       0.93      0.93      0.93       465


# --- ResNet152: load the saved model and score it on the test generator ---
resnet152 = load_model("/kaggle/input/garbage-classifier-model/garbage_classifier.resnet152.h5")

class_names = list(test_data.class_indices.keys())

# Predicted class = index of the highest score per row; true class ids come
# straight from the generator.
predictions = resnet152.predict(test_data)
predicted_labels = np.argmax(predictions, axis=1)
true_labels = test_data.classes

conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Confusion matrix heatmap (rows: true class, columns: predicted class).
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Per-class precision / recall / F1.
print(classification_report(true_labels, predicted_labels, target_names=class_names))

8/8 [==============================] - 6s 448ms/step

              precision    recall  f1-score   support

     battery       0.97      1.00      0.98        87
       glass       0.88      0.88      0.88        73
       metal       0.86      0.90      0.88        69
     organic       0.98      0.98      0.98        64
       paper       0.97      1.00      0.98        92
     plastic       0.93      0.82      0.87        80

    accuracy                           0.93       465
   macro avg       0.93      0.93      0.93       465
weighted avg       0.93      0.93      0.93       465


models = [vgg16, vgg19, resnet50, resnet152]
model_names = ['VGG16', 'VGG19', 'ResNet50', 'ResNet152']

loss_values = []
accuracy_values = []
# Evaluate each model on the test generator and store its loss and accuracy,
# in the same order as model_names.
# - Model.evaluate accepts generators directly; evaluate_generator is
#   deprecated and no longer available in recent Keras releases.
# - verbose=0 keeps the call as silent as evaluate_generator was.
# - The loop variable is not called `model`, so the notebook's global `model`
#   is not overwritten.
for candidate in models:
    loss, accuracy = candidate.evaluate(test_data, verbose=0)
    loss_values.append(loss)
    accuracy_values.append(accuracy)


# Combined chart: test loss as bars on the left y-axis, test accuracy as a
# line on a second y-axis that shares the same x-axis (one tick per model).
loss_color = 'lightblue'
accuracy_color = 'mediumseagreen'
fig, ax1 = plt.subplots(figsize=(10, 6))
sns.barplot(x=model_names, y=loss_values, color=loss_color, ax=ax1, label='Loss')
# twinx() creates the second y-axis used for accuracy.
ax2 = ax1.twinx()
sns.lineplot(x=model_names, y=accuracy_values, color=accuracy_color, marker='o', ax=ax2, label='Accuracy')
ax1.set_xlabel('Model')
ax1.set_ylabel('Loss', color='black')
ax2.set_ylabel('Accuracy', color='black')
plt.title('Loss and Accuracy for Models')
# Each axis gets its own legend, placed in opposite corners.
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')
plt.show()


def get_file_size(file_path):
    """Return the size of ``file_path`` in megabytes (1 MB = 1024 * 1024 bytes).

    Returns ``None`` when the path does not exist.
    """
    if not os.path.exists(file_path):
        return None
    # Size on disk in bytes, converted to MB.
    return os.path.getsize(file_path) / (1024 * 1024)


# Paths to the saved .h5 model files, in the same order as model_names.
# These are the files the models are loaded from, so every size is looked up
# at one consistent location.
file_paths = [
    '/kaggle/input/garbage-classifier-model/garbage_classifier.vgg_16.h5',
    '/kaggle/input/garbage-classifier-model/garbage_classifier.vgg_19.h5',
    '/kaggle/input/garbage-classifier-model/garbage_classifier.resnet50.h5',
    '/kaggle/input/garbage-classifier-model/garbage_classifier.resnet152.h5'
]

# Size of each file in MB (None for a file that does not exist).
file_sizes = [get_file_size(file_path) for file_path in file_paths]
size_vgg16, size_vgg19, size_resnet50, size_resnet152 = file_sizes

# Display names of the models.
model_names = ['VGG16', 'VGG19', 'ResNet50', 'ResNet152']
# The loop variable is not called `model`, so the notebook's global `model` is
# not rebound to a string.
for model_name, size in zip(model_names, file_sizes):
    if size is None:
        # get_file_size returns None for a missing file; formatting None with
        # :.2f would raise a TypeError.
        print(f"Size (MB) of model {model_name}: file not found")
        continue
    print(f"Size (MB) of model {model_name}: {size:.2f} MB")

Size (MB) of model VGG16: 374.35 MB
Size (MB) of model VGG19: 100.56 MB
Size (MB) of model ResNet50: 97.39 MB
Size (MB) of model ResNet152: 231.06 MB

Deployment Into Production: Machine Learning API on AWS

Garbage Classification - Nhóm 8

About Project:

Kiến trúc của software:

Table of contents:

Import library:

Data Collection:

About the dataset

Prepare dataset:

Exploratory Data Analysis (EDA):

Model:

Data Preprocessing:

Model Architecture:

Train Test Split:

Model Training and Validation

Compare Model:

VGG16:

VGG19:

ResNet50:

ResNet152:

So sánh loss và accuracy trên tập test ở từng model

Chọn lựa model có dung lượng/kích thước phù hợp để serving

Model Serving as API:

Deployment On Amazon Elastic Compute Cloud (AWS EC2) with GitHub Actions

Tự động hóa quy trình Deploy (CI/CD với GitHub Actions)

Full Source Code: Website, Model, API

	path	type_trash
0	/kaggle/input/garbage-classification-6-classes...	plastic
1	/kaggle/input/garbage-classification-6-classes...	glass
2	/kaggle/input/garbage-classification-6-classes...	plastic