python h5文件读取_python读取整个txt文件

全栈程序员站长

发布于 2022-10-03 03:53:03

90300

代码可运行

文章被收录于专栏：全栈程序员必看

运行总次数：0

代码可运行

大家好，又见面了，我是你们的朋友全栈君。

这篇文章提供一组工具类，用来辅助《医学图像分割实战 unet实现（二）》中“4、数据存储”这一小节的内容。

2019/5/2 更新：HDF5DatasetWriter 可以动态扩展存储大小

文件： HDF5DatasetGenerator.py

# -*- coding: utf-8 -*-
import h5py
import os
import numpy as np
class HDF5DatasetGenerator:
    """Yield mini-batches of images and masks read from an HDF5 file.

    The file must contain two datasets named ``"images"`` and ``"masks"``.
    Both are indexed along their first axis and read with four-axis
    indexing (``[indices, :, :, :]``).
    """

    def __init__(self, dbPath, batchSize, preprocessors=None,
                 aug=None, binarize=True, classes=2):
        """Open the HDF5 file and work out how many batches form one epoch.

        Args:
            dbPath: Path of the HDF5 file to read.
            batchSize: Maximum number of samples per yielded batch.
            preprocessors: Optional iterable of objects exposing
                ``preprocess(image)``; applied in order to every image.
            aug: Optional augmenter exposing
                ``flow(images, labels, batch_size=...)``.
            binarize: Stored on the instance; not used by ``generator``.
            classes: Stored on the instance; not used by ``generator``.
        """
        self.batchSize = batchSize
        self.preprocessors = preprocessors
        self.aug = aug
        self.binarize = binarize
        self.classes = classes
        # Open read-only explicitly: the default mode depends on the h5py
        # version, and a reader must never create or modify the file.
        self.db = h5py.File(dbPath, "r")
        self.numImages = self.db["images"].shape[0]
        print("total images:", self.numImages)
        # Integer ceiling division; yields 0 batches for an empty dataset.
        self.num_batches_per_epoch = (
            (self.numImages + batchSize - 1) // batchSize
        )

    def generator(self, shuffle=True, passes=np.inf):
        """Yield ``(images, labels)`` batches for up to ``passes`` epochs.

        Args:
            shuffle: When true, samples are drawn in a fresh random order
                every epoch; when false, they are read in storage order.
            passes: Number of epochs to produce (infinite by default).

        Yields:
            A tuple ``(images, labels)``. Within a batch the samples are
            ordered by ascending dataset index.
        """
        if self.numImages == 0:
            # Nothing to read; without this guard an unbounded `passes`
            # would loop forever without ever yielding.
            return

        image_store = self.db["images"]
        mask_store = self.db["masks"]
        epochs = 0
        while epochs < passes:
            if shuffle:
                order = np.random.permutation(self.numImages)
            else:
                order = np.arange(self.numImages)

            for batch_num in range(self.num_batches_per_epoch):
                start_index = batch_num * self.batchSize
                end_index = min(start_index + self.batchSize, self.numImages)
                # h5py fancy indexing takes a list of indices that must be
                # in increasing order, hence the sort.
                batch_indices = np.sort(order[start_index:end_index]).tolist()
                images = image_store[batch_indices, :, :, :]
                labels = mask_store[batch_indices, :, :, :]
                # NOTE: `binarize` / `classes` are not applied here; labels
                # are yielded exactly as stored.

                if self.preprocessors is not None:
                    processed = []
                    for image in images:
                        for p in self.preprocessors:
                            image = p.preprocess(image)
                        processed.append(image)
                    images = np.array(processed)

                if self.aug is not None:
                    # Take a single batch from the augmenter. `images`
                    # already holds at most batchSize samples, so this call
                    # only transforms the current batch, it does not
                    # re-batch it.
                    (images, labels) = next(self.aug.flow(
                        images, labels, batch_size=self.batchSize))

                yield (images, labels)
            epochs += 1

    def close(self):
        """Close the underlying HDF5 file."""
        self.db.close()

文件： HDF5DatasetWriter.py

# -*- coding: utf-8 -*-
import h5py
import os
class HDF5DatasetWriter:
    """Buffer image/mask pairs in memory and write them to a new HDF5 file.

    Two datasets are created, ``"images"`` (float) and ``"masks"`` (int).
    Both can grow along their first axis, so the sample counts given in
    ``image_dims`` / ``mask_dims`` are only initial capacities.
    """

    def __init__(self, image_dims, mask_dims, outputPath, bufSize=200):
        """Create the output file and its two resizable datasets.

        Args:
            image_dims: Initial shape of the ``"images"`` dataset; the first
                entry is the initial number of samples.
            mask_dims: Initial shape of the ``"masks"`` dataset.
            outputPath: Path of the HDF5 file to create.
            bufSize: Once this many samples are buffered in memory they are
                flushed to the file.

        Raises:
            ValueError: If ``outputPath`` already exists.
        """
        if os.path.exists(outputPath):
            # The literals are concatenated, so each needs its own trailing
            # space to keep the words apart.
            raise ValueError(
                "The supplied 'outputPath' already "
                "exists and cannot be overwritten. Manually delete "
                "the file before continuing",
                outputPath,
            )
        self.db = h5py.File(outputPath, "w")
        self.data = self.db.create_dataset(
            "images", image_dims,
            maxshape=(None,) + tuple(image_dims[1:]), dtype="float")
        self.masks = self.db.create_dataset(
            "masks", mask_dims,
            maxshape=(None,) + tuple(mask_dims[1:]), dtype="int")
        self.dims = image_dims
        self.bufSize = bufSize
        self.buffer = {"data": [], "masks": []}
        self.idx = 0  # number of samples already written to the file

    def add(self, rows, masks):
        """Append samples to the in-memory buffer, flushing when it is full.

        Args:
            rows: Iterable of images to append.
            masks: Iterable of masks, parallel to ``rows``.
        """
        # extend() copies the items into the buffer lists, so the buffer
        # never aliases the caller's list objects.
        self.buffer["data"].extend(rows)
        self.buffer["masks"].extend(masks)
        print("len ", len(self.buffer["data"]))
        if len(self.buffer["data"]) >= self.bufSize:
            self.flush()

    @staticmethod
    def _ensure_capacity(dataset, needed):
        """Grow ``dataset`` along axis 0 until it can hold ``needed`` rows."""
        capacity = dataset.shape[0]
        if needed <= capacity:
            return
        # Keep doubling (starting from at least 1) until everything fits;
        # a single doubling is not enough for a large buffer or an
        # initially empty dataset.
        new_len = max(capacity, 1)
        while new_len < needed:
            new_len *= 2
        print("resize to new_shape:", (new_len,) + tuple(dataset.shape[1:]))
        # Resize only the first axis so that each dataset keeps its own
        # trailing dimensions (masks need not share the image shape).
        dataset.resize(new_len, axis=0)

    def flush(self):
        """Write the buffered samples to the file and clear the buffer."""
        pending = len(self.buffer["data"])
        if pending == 0:
            return
        i = self.idx + pending
        self._ensure_capacity(self.data, i)
        self._ensure_capacity(self.masks, i)
        self.data[self.idx:i] = self.buffer["data"]
        self.masks[self.idx:i] = self.buffer["masks"]
        print("h5py have writen %d data" % i)
        self.idx = i
        self.buffer = {"data": [], "masks": []}

    def close(self):
        """Flush pending samples, trim unused rows and close the file.

        Trimming matters because capacity grows by doubling: without it the
        datasets would keep unwritten padding rows that a reader would
        count as samples.
        """
        if len(self.buffer["data"]) > 0:
            self.flush()
        for dataset in (self.data, self.masks):
            if dataset.shape[0] > self.idx:
                dataset.resize(self.idx, axis=0)
        self.db.close()