"""
Data tools. Includes both a keras Sequence
Author: Simon Thomas
Date: Jul-06-2020
"""
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import Sequence
from skimage.transform import resize
from skimage import io

AUTOTUNE = tf.data.experimental.AUTOTUNE


def create_data_set(data_directory=None, file_names=None, img_dim=4, batch_size=12):
"""
Creates a data set using the files in the data directory. If
file_names is specified, only these file names will be used
in the data set.
:param data_directory: the path to the data directory.
:param file_names: a list of full path file names
:param img_dim: the size of the image
    :param batch_size: the batch size. Can be changed later with
                       ds = ds.unbatch().batch(new_batch_size)
    :return: the shuffled, batched and prefetched data set (ds)
"""
if not file_names:
file_names = [os.path.join(data_directory, file) for file in os.listdir(data_directory)]
def parse_image(filename):
"""
Reads the image and returns the image, noise and constant tensors
:param filename: the image to load
:return: style batch - ( image, noise_img, constant)
"""
image = tf.io.read_file(filename)
image = tf.image.decode_jpeg(image)
image = tf.image.convert_image_dtype(image, tf.float32)
image = tf.image.resize(image, [img_dim, img_dim])
        # (H, W, 1) noise, matching the channel layout of the generator's style noise
        noise_image = tf.random.normal([img_dim, img_dim, 1], mean=0.0, stddev=1.0)
constant = tf.ones([1])
return image, noise_image, constant
n = len(file_names)
ds = tf.data.Dataset.from_tensor_slices(file_names)
ds = ds.shuffle(buffer_size=n, seed=1234, reshuffle_each_iteration=True)
ds = ds.map(parse_image, num_parallel_calls=AUTOTUNE)
ds = ds.batch(batch_size, drop_remainder=True)
ds = ds.prefetch(buffer_size=AUTOTUNE)
return ds
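
# Example usage (a minimal sketch; the directory path is an assumption):
#
#   ds = create_data_set("data/celeba-128", img_dim=4, batch_size=12)
#   for image, noise, constant in ds.take(1):
#       print(image.shape)  # (12, 4, 4, 3)
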
def get_test_batch(data_set):
"""
Returns the first batch from the data set
:param data_set: data_set object
:return: batch of tensors
"""
    return next(iter(data_set))
class DataGenerator(Sequence):
"""
A data generator that inherits the keras.utils.Sequence
class so that it can be used with multi-processing
"""
def __init__(self, directory, batch_size, img_dim=None, shuffle=True,
style=False, target_dim=None):
"""
Inputs:
directory - the directory where the images are
batch_size - the batch size
img_dim - the size of the image if wanting to resize
shuffle - whether to shuffle the images
style - whether to include styleVAE noise as part of X
            target_dim - the target size in progressive growing, e.g. 4->8->...->256, where the target is 256
Outputs:
if style:
batch of images (batch_size, img_dim, img_dim, 3),
noise (batch_size, target_dim, target_dim, 1),
            constant (batch_size, 1, 1)
else:
batch of images, batch of images
"""
self.dir = directory
self.batch_size = batch_size
self.img_dim = img_dim
self.shuffle = shuffle
self.style = style
self.target_dim = target_dim
# ---------------------------------------- #
self.files = np.array(os.listdir(self.dir))
self.n = len(self.files)
self.indices = np.arange(self.n)
        if self.shuffle:
            np.random.seed(0)
            np.random.shuffle(self.indices)
        self.files_in_batch = []
        self.pos = 0
def __len__(self):
"""Denotes the number of batches per epoch"""
return self.n // self.batch_size
def _indices_at(self, batch_index):
"""Returns the list of indices for batch index"""
return self.indices[batch_index * self.batch_size:(batch_index + 1) * self.batch_size]
def __getitem__(self, index):
"""Generate one batch of data"""
# Generate indices of the batch
indices = self._indices_at(index)
# Get list of files in batch
self.files_in_batch = self.files[indices]
batch = []
for i in indices:
# Create filename
fname = os.path.join(self.dir, self.files[i])
# Load image & scale between 0-1
img = io.imread(fname)[:, :, 0:3] / 255.
# resize
if img.shape[0] != self.img_dim:
img = resize(img, (self.img_dim, self.img_dim))
batch.append(img)
batch = np.stack(batch)
if self.style:
X = [
# Input Image
batch,
# Noise Image
np.random.normal(0, 1, (self.batch_size, self.target_dim, self.target_dim, 1)),
# Constant input
np.ones((self.batch_size, 1, 1))
]
return X
# Standard
else:
return batch, batch
    def set_files(self, files):
        """Replaces the file list and resets the generator state."""
        self.files = np.array(files)
        self.n = len(self.files)
        self.shuffle = True
        self.indices = np.arange(self.n)
        np.random.shuffle(self.indices)
        self.pos = 0
        print("Setting files and resetting generator.")
    def __next__(self):
        # Wrap around to the first batch at the end of an epoch
        if self.pos >= (self.n // self.batch_size):
            self.pos = 0
        result = self[self.pos]
        self.pos += 1
        return result
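
# A minimal usage sketch (not in the original file): because DataGenerator
# subclasses keras.utils.Sequence it can be passed directly to Model.fit,
# which is what enables multi-process loading. `model` here is an assumed,
# already-compiled Keras model:
#
#   gen = DataGenerator(directory="data/celeba-128", batch_size=16, img_dim=64)
#   model.fit(gen, epochs=10, workers=4, use_multiprocessing=True)
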
def create_patch_generator(filename, dim, scale=True):
"""
Creates a multi-patch generator from a whole image. This makes
processing whole images much more efficient.
Note: It does NOT use overlapping patches, but rather pads the image so
that even size patches can be sampled.
"""
# Load Image
print("reading in", filename, "...")
image = io.imread(filename) / 255.
# Scale to 2x reduction
if scale:
image = resize(image, (image.shape[0] // 2, image.shape[1] // 2))
    # Pad Image (with white) so it divides evenly into dim x dim patches
    row_add = (-image.shape[0]) % dim
    col_add = (-image.shape[1]) % dim
    image_padded = np.pad(image, ([0, row_add], [0, col_add], [0, 0]), constant_values=1)
print("Converting to tensorflow dataset")
# Convert to tensor
image_padded_tensor = tf.convert_to_tensor(image_padded)
def get_patch(index):
r = index[0]
c = index[1]
return image_padded_tensor[r * dim:r * dim + dim, c * dim:c * dim + dim]
# Get indices
n_rows = image_padded.shape[0] // dim
n_cols = image_padded.shape[1] // dim
indices = []
for r in range(n_rows):
for c in range(n_cols):
indices.append([r, c])
indices = np.array(indices)
# Create dataset
    ds = tf.data.Dataset.from_tensor_slices(indices)
ds = ds.map(get_patch)
ds = ds.batch(12)
ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
return ds, (n_rows, n_cols)
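
# The inverse operation is not provided above. The following helper is a
# hedged sketch (stitch_patches is not part of the original file) showing how
# the row-major patches from create_patch_generator could be reassembled into
# the padded whole image, e.g.:
#
#   patch_ds, grid = create_patch_generator("slide.jpg", dim=256)
#   patches = np.concatenate([batch.numpy() for batch in patch_ds], axis=0)
#   whole = stitch_patches(patches, grid)
def stitch_patches(patches, grid):
    """
    Reassembles patches back into the padded whole image.
    :param patches: array of shape (n_rows * n_cols, dim, dim, channels)
    :param grid: the (n_rows, n_cols) tuple returned by create_patch_generator
    :return: the padded whole image as a numpy array
    """
    n_rows, n_cols = grid
    # Join each row of patches along the width, then stack the rows vertically
    rows = [np.concatenate(patches[r * n_cols:(r + 1) * n_cols], axis=1)
            for r in range(n_rows)]
    return np.concatenate(rows, axis=0)
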
if __name__ == "__main__":
DATA_DIR = "/home/simon/PycharmProjects/StyleALAE/data/celeba-128"
BATCH_SIZE = 12
DIM = 4
data_gen = DataGenerator(directory=DATA_DIR,
batch_size=BATCH_SIZE,
img_dim=DIM,
style=True,
target_dim=DIM)
x, noise, constant = next(data_gen)
files = [os.path.join(DATA_DIR, file) for file in os.listdir(DATA_DIR)]
ds = create_data_set(file_names=files, img_dim=4, batch_size=128)
print(ds)
    # Grab and inspect the first batch
    batch = get_test_batch(ds)
    print(batch)
# print("one")
# for batch in ds:
# print(type(batch))
# print("two")
# for batch in ds:
# print(type(batch))