DAY 36 Review Day

Carefully review the neural-network material covered so far; anyone who has fallen behind should use today to catch up.

  • **Homework:** For the earlier credit project, train it with a neural network, and try to apply the knowledge points covered so far to make the code more standardized and readable (a minimal sketch follows below).
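
Since the homework only asks to train the credit data with a neural network and tidy up the code, here is a minimal sketch of one way to do it with `tf.keras`. The feature matrix `X`, the labels `y`, and the layer sizes are placeholder assumptions for illustration, not the official solution.

```python
# A minimal sketch (assumptions, not the official solution): a small fully-connected
# network for a binary credit-default label. X and y are stand-in names for the
# preprocessed credit features and labels.
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def build_credit_model(n_features):
    """Two hidden layers with dropout and a sigmoid output for binary classification."""
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(n_features,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Random stand-in data so the sketch runs on its own; replace with the real credit features.
X = np.random.rand(1000, 20).astype(np.float32)
y = np.random.randint(0, 2, size=1000)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)   # fit the scaler on training data only
X_test = scaler.transform(X_test)

model = build_credit_model(X_train.shape[1])
model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.1, verbose=0)
print('test accuracy:', model.evaluate(X_test, y_test, verbose=0)[1])
```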

Here is my reproduction of the earlier transfer-learning project, this time with an SMOTE-based improvement for class imbalance:

```python
# SMOTE-improved version of the transfer-learning reproduction

import glob
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow.io.gfile import GFile
# Extra library needed for SMOTE oversampling
from imblearn.over_sampling import SMOTE

# compat.v1 placeholders and Session require graph mode when running under TF 2.x
tf.compat.v1.disable_eager_execution()

# -------------------------- Constants --------------------------
BOTTLENECK_TENSOR_SIZE = 2048                    # length of the Inception-v3 bottleneck vector
BOTTLENECK_TENSOR_NAME = 'pool_3/_reshape:0'     # bottleneck tensor name in the frozen graph
JPEG_DATA_TENSOR_NAME = 'DecodeJpeg/contents:0'  # raw-JPEG input tensor name

MODEL_DIR = r"E:\项目学习内容\迁移学习\time1数据集\inception-2015-12-05 (1)\\"
os.makedirs(MODEL_DIR, exist_ok=True)
MODEL_FILE = 'tensorflow_inception_graph.pb'

CACHE_DIR = './images/tmp/bottleneck/'
os.makedirs(CACHE_DIR, exist_ok=True)

INPUT_DATA = r"E:\项目学习内容\迁移学习\time1数据集\time11\GC10-DET\data"
VALIDATION_PERCENTAGE = 10
TEST_PERCENTAGE = 10

LEARNING_RATE = 0.01
STEPS = 4000
BATCH = 100

# -------------------------- Data preprocessing --------------------------
def create_image_lists(testing_percentage, validation_percentage):
    """Scan INPUT_DATA and split each class's images into training/testing/validation lists."""
    result = {}
    sub_dirs = [x[0] for x in os.walk(INPUT_DATA)]
    is_root_dir = True
    for sub_dir in sub_dirs:
        if is_root_dir:
            is_root_dir = False
            continue

        extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']
        file_list = []
        dir_name = os.path.basename(sub_dir)
        for extension in extensions:
            file_glob = os.path.join(INPUT_DATA, dir_name, f'*.{extension}')
            file_list.extend(glob.glob(file_glob))
        if not file_list:
            continue

        label_name = dir_name.lower()
        training_images = []
        testing_images = []
        validation_images = []
        for file_name in file_list:
            base_name = os.path.basename(file_name)
            chance = np.random.randint(100)
            if chance < validation_percentage:
                validation_images.append(base_name)
            elif chance < (testing_percentage + validation_percentage):
                testing_images.append(base_name)
            else:
                training_images.append(base_name)

        result[label_name] = {
            'dir': dir_name,
            'training': training_images,
            'testing': testing_images,
            'validation': validation_images
        }
    return result


def get_image_path(image_lists, image_dir, label_name, index, category):
    label_lists = image_lists[label_name]
    category_list = label_lists[category]
    mod_index = index % len(category_list)
    base_name = category_list[mod_index]
    sub_dir = label_lists['dir']
    return os.path.join(image_dir, sub_dir, base_name)


def get_bottleneck_path(image_lists, label_name, index, category):
    return get_image_path(image_lists, CACHE_DIR, label_name, index, category) + '.txt'


# -------------------------- Feature extraction --------------------------
def run_bottleneck_on_image(sess, image_data, image_data_tensor, bottleneck_tensor):
    bottleneck_values = sess.run(bottleneck_tensor, feed_dict={image_data_tensor: image_data})
    return np.squeeze(bottleneck_values)


def get_or_create_bottleneck(sess, image_lists, label_name, index, category, jpeg_data_tensor, bottleneck_tensor):
    label_lists = image_lists[label_name]
    sub_dir = label_lists['dir']
    sub_dir_path = os.path.join(CACHE_DIR, sub_dir)
    os.makedirs(sub_dir_path, exist_ok=True)

    bottleneck_path = get_bottleneck_path(image_lists, label_name, index, category)
    if os.path.exists(bottleneck_path):
        with open(bottleneck_path, 'r') as f:
            bottleneck_string = f.read()
        return [float(x) for x in bottleneck_string.split(',')]

    image_path = get_image_path(image_lists, INPUT_DATA, label_name, index, category)
    with GFile(image_path, 'rb') as f:
        image_data = f.read()
    bottleneck_values = run_bottleneck_on_image(sess, image_data, jpeg_data_tensor, bottleneck_tensor)
    bottleneck_string = ','.join(str(x) for x in bottleneck_values)
    with open(bottleneck_path, 'w') as f:
        f.write(bottleneck_string)

    return bottleneck_values


# -------------------- Training-data preparation (key change: SMOTE oversampling) --------------------
def prepare_smote_training_data(sess, image_lists, n_classes, jpeg_data_tensor, bottleneck_tensor):
    """Collect all training-set bottleneck features and labels up front, then balance them with SMOTE."""
    all_bottlenecks = []
    all_labels = []
    label_name_list = list(image_lists.keys())
    
    # Collect the original training data
    for label_index, label_name in enumerate(label_name_list):
        for index in range(len(image_lists[label_name]['training'])):
            bottleneck = get_or_create_bottleneck(
                sess, image_lists, label_name, index, 'training', jpeg_data_tensor, bottleneck_tensor
            )
            all_bottlenecks.append(bottleneck)
            label = np.zeros(n_classes, dtype=np.float32)
            label[label_index] = 1.0
            all_labels.append(label)
    
    # Convert to numpy arrays
    X = np.array(all_bottlenecks)
    y = np.array(all_labels).argmax(axis=1)  # SMOTE expects integer class labels

    # Apply SMOTE oversampling (the key step)
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Convert back to one-hot encoding
    y_resampled_onehot = np.zeros((len(y_resampled), n_classes), dtype=np.float32)
    y_resampled_onehot[np.arange(len(y_resampled)), y_resampled] = 1.0
    
    return X_resampled, y_resampled_onehot


def get_smote_batch(smote_features, smote_labels, batch_size):
    """从SMOTE处理后的平衡数据中随机取批次"""
    indices = np.random.choice(len(smote_features), batch_size, replace=False)
    return smote_features[indices], smote_labels[indices]


# -------------------------- Test-data preparation --------------------------
def get_test_bottlenecks(sess, image_lists, n_classes, jpeg_data_tensor, bottleneck_tensor):
    bottlenecks = []
    ground_truths = []
    label_name_list = list(image_lists.keys())
    for label_index, label_name in enumerate(label_name_list):
        category = 'testing'
        for index in range(len(image_lists[label_name][category])):
            bottleneck = get_or_create_bottleneck(
                sess, image_lists, label_name, index, category, jpeg_data_tensor, bottleneck_tensor
            )
            ground_truth = np.zeros(n_classes, dtype=np.float32)
            ground_truth[label_index] = 1.0
            bottlenecks.append(bottleneck)
            ground_truths.append(ground_truth)
    return np.array(bottlenecks), np.array(ground_truths)


# -------------------------- Main --------------------------
def main(_):
    image_lists = create_image_lists(TEST_PERCENTAGE, VALIDATION_PERCENTAGE)
    n_classes = len(image_lists.keys())
    if n_classes == 0:
        raise ValueError(f"No class folders found under the target-domain dataset path {INPUT_DATA}; please check!")

    model_path = os.path.join(MODEL_DIR, MODEL_FILE)
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Source-domain model {model_path} does not exist; please download it and place it at this path!")

    with GFile(model_path, 'rb') as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())

    with tf.compat.v1.Session() as sess:
        bottleneck_tensor, jpeg_data_tensor = tf.compat.v1.import_graph_def(
            graph_def, return_elements=[BOTTLENECK_TENSOR_NAME, JPEG_DATA_TENSOR_NAME]
        )

        # Define the model structure
        bottleneck_input = tf.compat.v1.placeholder(tf.float32, [None, BOTTLENECK_TENSOR_SIZE], name='BottleneckInputPlaceholder')
        ground_truth_input = tf.compat.v1.placeholder(tf.float32, [None, n_classes], name='GroundTruthInput')

        with tf.compat.v1.name_scope('final_training_ops'):
            weights = tf.compat.v1.Variable(tf.random.truncated_normal([BOTTLENECK_TENSOR_SIZE, n_classes], stddev=0.001))
            biases = tf.compat.v1.Variable(tf.zeros([n_classes]))
            logits = tf.matmul(bottleneck_input, weights) + biases
            final_tensor = tf.nn.softmax(logits)

        # Define the loss and optimizer
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=ground_truth_input)
        cross_entropy_mean = tf.reduce_mean(cross_entropy)
        train_step = tf.compat.v1.train.GradientDescentOptimizer(LEARNING_RATE).minimize(cross_entropy_mean)

        # Define the evaluation metric
        with tf.compat.v1.name_scope('evaluation'):
            correct_prediction = tf.equal(tf.argmax(final_tensor, 1), tf.argmax(ground_truth_input, 1))
            evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        sess.run(tf.compat.v1.global_variables_initializer())

        # Generate the SMOTE-balanced data up front (key change)
        smote_features, smote_labels = prepare_smote_training_data(
            sess, image_lists, n_classes, jpeg_data_tensor, bottleneck_tensor
        )

        # Training loop
        for i in range(STEPS):
            # Train on the SMOTE-balanced data (key change)
            train_bottlenecks, train_ground_truth = get_smote_batch(smote_features, smote_labels, BATCH)
            sess.run(train_step, feed_dict={
                bottleneck_input: train_bottlenecks,
                ground_truth_input: train_ground_truth
            })

            if i % 100 == 0 or i + 1 == STEPS:
                # The validation set keeps its original distribution (avoids data leakage)
                validation_bottlenecks = []
                validation_ground_truth = []
                label_name_list = list(image_lists.keys())
                for _ in range(BATCH):
                    label_index = random.randrange(n_classes)
                    label_name = label_name_list[label_index]
                    image_index = random.randrange(65536)  # arbitrary large index; get_image_path wraps it with modulo
                    bottleneck = get_or_create_bottleneck(
                        sess, image_lists, label_name, image_index, 'validation', jpeg_data_tensor, bottleneck_tensor
                    )
                    ground_truth = np.zeros(n_classes, dtype=np.float32)
                    ground_truth[label_index] = 1.0
                    validation_bottlenecks.append(bottleneck)
                    validation_ground_truth.append(ground_truth)
                validation_acc = sess.run(evaluation_step, feed_dict={
                    bottleneck_input: validation_bottlenecks,
                    ground_truth_input: validation_ground_truth
                })
                print(f"Step {i:4d}: 验证集准确率 = {validation_acc * 100:.1f}%")

        # Evaluate on the test set
        test_bottlenecks, test_ground_truth = get_test_bottlenecks(
            sess, image_lists, n_classes, jpeg_data_tensor, bottleneck_tensor
        )
        test_acc = sess.run(evaluation_step, feed_dict={
            bottleneck_input: test_bottlenecks,
            ground_truth_input: test_ground_truth
        })
        print(f"\n最终测试集准确率 = {test_acc * 100:.1f}%")


if __name__ == '__main__':
    tf.compat.v1.app.run(main=main)
```
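
For readers who want to see the SMOTE step in isolation, here is a small self-contained sketch on made-up features; the shapes and class counts are invented for illustration, but the `SMOTE.fit_resample` call is the same one used in `prepare_smote_training_data` above.

```python
# Standalone illustration of the SMOTE oversampling step, on made-up data.
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE

rng = np.random.default_rng(42)
# Imbalanced toy "bottleneck" features: 100 samples of class 0, 10 samples of class 1.
X = np.vstack([
    rng.normal(0.0, 1.0, size=(100, 2048)),
    rng.normal(0.5, 1.0, size=(10, 2048)),
]).astype(np.float32)
y = np.array([0] * 100 + [1] * 10)

print('before:', Counter(y))    # Counter({0: 100, 1: 10})
smote = SMOTE(random_state=42)  # default k_neighbors=5 needs at least 6 samples per class
X_res, y_res = smote.fit_resample(X, y)
print('after: ', Counter(y_res))  # both classes now have 100 samples
```

In the training script above, the resampled arrays are simply fed to `get_smote_batch`, so every mini-batch is drawn from an already balanced pool.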

@浙大疏锦行
