forked from vickysun5/SmartProcessAnalytics
-
Notifications
You must be signed in to change notification settings - Fork 1
/
create_random_data.py
107 lines (105 loc) · 8.42 KB
/
create_random_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import numpy as np
import pandas as pd
from time import localtime
import pdb
def create_random_data(data_size, mode, data_range = (-10, 10), degrees = [1], seed = 123456789, coefficients = None, noise = (0, 0)):
"""
Creates "data_size" random points with values from data_range[0] to data_range[1], then use them and a functional form ...
(defined by "mode" and "degrees") to generate a y response.
"""
# Creating the X_data for the dataset
rng = np.random.default_rng(seed)
if mode.casefold() in {'multicollinear', 'multicollinearity'}:
X_0 = rng.integers(data_range[0]*10, data_range[1]*10, (data_size[0], 1)) / 10
X_noise_sigma = 0.05 # Edit as needed. New variable so that this can be added to the file_name below
noise_MC = rng.normal(0, X_noise_sigma, X_0.shape)
X_1 = X_0 + noise_MC
X_data = np.concatenate((X_0, X_1), axis = 1)
coefficients = np.array([4, 2])
print(f'The X_1 noise level for the multicollinear model is {np.mean(np.abs(noise_MC/X_0))*100:.1f}%')
else: # General case
X_data = rng.integers(data_range[0]*10, data_range[1]*10, data_size) / 10 # Numbers between data_range[0] and data_range[1] with one decimal place
# Setting up the other parameters for the function
noise = tuple(int(elem) if elem == int(elem) else elem for elem in noise) # Changes integer noise parameters to actual ints for display convenience
noise_vals = rng.normal(noise[0], noise[1], X_data.shape[0]) # Always run to "burn" X_data.shape[0] values from rng to ensure reproducibility between noisy and noiseless runs
if noise[1]:
noise_str = f' + ϵ{noise}'
else:
noise_str = ''
noise_vals = 0
if coefficients is None:
coefficients = rng.integers(-100, 100, X_data.shape[1]) / 10
else:
_ = rng.integers(-100, 100, X_data.shape[1]) / 10 # Ran to "burn" X_data.shape[1] values from the rng to ensure reproducibility
degrees = [int(elem) if elem == int(elem) else elem for elem in degrees] # Changes integer degrees to actual ints for display convenience
powers = rng.choice(degrees, X_data.shape[1])
# Creating the y_data based on the input parameters
if mode.casefold() == 'poly':
is_ln = [False]*X_data.shape[1]
y_data = (coefficients * X_data**powers).sum(axis = 1)
elif mode.casefold() == 'log':
is_ln = [True]*X_data.shape[1]
y_data = (coefficients * np.log(np.abs(X_data))**powers).sum(axis = 1)
elif mode.casefold() in {'stefan-boltzmann', 'stefanboltzmann', 'stefan_boltzmann'}:
mode = 'stefanboltzmann' # For standardization
degrees = [1] # For standardization since we have a fixed eqn
y_data = 3.828e26 * X_data[:, 0]**2 * X_data[:, 1]**4 # L_{Sun} * R^2 * T^4 [In units of R_{Sun} and T_{Sun}]
functional_form_str = f'L = 3.828e26 * R^2 * T^4{noise_str}'
elif mode.casefold() == 'energy': # NOTE: if irrelevant variables are not needed, set data_size to (K, 1), as V always gets added later
degrees = [1] # For standardization since we have a fixed eqn, even though there's a degree 4 interaction with m^2*v^2
V = rng.integers(5e8, 2.5e9, data_size) / 10 # Numbers between 50 000 000 (5e7) and 250 000 000 (2.5e8) with one decimal place
X_data = np.hstack((X_data, V))
c = 299792458 # m / s
y_data = c**2 * X_data[:, 0]**2 * X_data[:, 1]**2 + c**4 * X_data[:, 0]**2 # E^2
functional_form_str = f'E^2 = 8.98755e16*m^2*v^2 + 8.07761e33*m^2{noise_str}'
file_name = f'{mode}_{data_size[0]}x{data_size[1]}-data_{data_range[0]}to{data_range[1]}-range_{seed}-seed_({noise[0]},{noise[1]:.2e})-noise.csv'
print(f'Relativistic effects account for {np.mean(1 - c**4 * X_data[:, 0]**2/y_data)*100:.2f}% of this system\'s energy')
elif mode.casefold() in {'multicollinear', 'multicollinearity'}:
mode = 'multicollinear'
y_data = (coefficients*X_data).sum(axis = 1)
functional_form_str = f'y = 4*X_0 + 2*X_1{noise_str}'
file_name = f'{mode}_{data_size[0]}x2-data_{data_range[0]}to{data_range[1]}-range_{seed}-seed_({noise[0]},{noise[1]})-ynoise_(0,{X_noise_sigma})-Xnoise.csv'
else: # TODO: mixture of log and linear, maybe other functional forms
raise NotImplementedError
# Saving the data using SPA's preferred format
y_data_noisy = y_data + noise_vals
all_data = np.column_stack((X_data, y_data_noisy))
degree_str = str(degrees)[1:-1]
coeff_str = str(coefficients)[1:-1].replace(",", "")
if 'file_name' not in locals(): # Specific equations already have the file_name built-in within their if-statement
file_name = f'{mode}_{data_size[0]}x{data_size[1]}-data_{data_range[0]}to{data_range[1]}-range_{degree_str.replace(" ", "")}-degree_{seed}-seed_({noise[0]},{noise[1]})-noise.csv'
np.savetxt(file_name, all_data, delimiter = ',', comments = '')
# Printing the final functional form
if 'functional_form_str' not in locals(): # Specific equations already have the functional_form_str built-in within their if-statement
functional_form_str = [f'{coefficients[idx]}*ln(x{idx})^{powers[idx]}' if is_ln[idx] else f'{coefficients[idx]}*(x{idx})^{powers[idx]}' for idx in range(X_data.shape[1])] # coefficients*log(x)^power or coeffiicents*x^power
functional_form_str = ' + '.join([elem[:-2] + elem[-2:].replace('^1', '') for elem in functional_form_str]) # Removing ^1. String addition to avoid removing ^10, ^11, ^12, ...
functional_form_str = f'y = {functional_form_str}{noise_str}'.replace('+ -', '- ')
print('The final functional form for this dataset is:')
print(functional_form_str)
print(f'The noise level is {np.mean(np.abs(noise_vals/y_data))*100:.1f}%')
# Writing a log file
time_now = '-'.join([str(elem) for elem in localtime()[:6]]) # YYYY-MM-DD-hh-mm-ss
log_text = f'Log for create_random_data()\n' + (
f'Function call: python create_random_data.py {data_size[0]} {data_size[1]} {mode} -dr {data_range[0]} {data_range[1]} -d {degree_str.replace(",", "")} -s {seed} -c {coeff_str} -n {noise[0]} {noise[1]} \n') + (
f'Functional form: {functional_form_str}\n') + (
f'Noise level: {np.mean(np.abs(noise_vals/y_data))*100:.1f}%\n') + (
f'Completion time: {time_now}\n')
with open(file_name[:-3] + 'log', 'w') as f:
f.write(log_text)
if __name__ == '__main__':
# Input setup
import argparse
parser = argparse.ArgumentParser(description = 'Creates "data_size" random points, then use them and a functional form (defined by "mode" and "degrees") to generate a y response.')
parser.add_argument('data_size', type = int, nargs = 2, help = 'The size of the X data that will be randomly created')
parser.add_argument('mode', type = str, nargs = 1, help = 'The functional form of the dataset response. Must be in {"poly", "log", "energy", "multicollinear"}. "Muticollinear" is for a specific test, and in general should not be called by the user')
parser.add_argument('-dr', '--datarange', type = int, nargs = 2, metavar = (-10, 10), default = (-10, 10), help = 'Integers representing the minimum and maximum values of the X data. ' +
'The limits are multiplied by 10, used to generate random integers, then divided by 10 to turn the X data into floats with one decimal digit.')
parser.add_argument('-d', '--degrees', type = float, nargs = '+', metavar = 1, default = [1], help = 'A list of valid degrees to which the X data is randomly raised.' +
'For example, degrees = [1, 2] will include linear and quadratic terms at random, degrees = [2] will include only quadratic terms.')
parser.add_argument('-s', '--seed', type = int, nargs = 1, metavar = 'int', default = [123456789],
help = 'The seed that is passed to np.random.default_rng() for reproducibility.')
parser.add_argument('-c', '--coefficients', type = float, nargs = '+', metavar = 'None or list', default = None,
help='The coefficients that multiply each feature of the X data. If None, they are generated at random between -10 and 10.')
parser.add_argument('-n', '--noise', type = float, nargs = 2, metavar = (0, 0), default = (0, 0), help='The mean and stdev of the Gaussian noise added to the y data. By default, no noise is added.')
myargs = parser.parse_args()
create_random_data(myargs.data_size, myargs.mode[0], tuple(myargs.datarange), myargs.degrees, myargs.seed[0], myargs.coefficients, tuple(myargs.noise))