forked from tanelp/tiny-diffusion
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatasets.py
155 lines (127 loc) · 5.18 KB
/
datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import numpy as np
import pandas as pd
import math
import torch
from sklearn.datasets import make_moons
from torch.utils.data import TensorDataset
def moons_dataset(n=8000):
    """Return a TensorDataset of ``n`` 2-D points from the two-moons shape.

    The points come from ``make_moons`` with a fixed seed (42) and noise
    0.03, then each axis is shifted by 0.3 and rescaled: the first axis by
    a factor of 2, the second by a factor of 3, both followed by ``- 1``.

    Parameters:
        n (int): Number of samples requested from ``make_moons``.

    Returns:
        TensorDataset: Wraps a single float32 tensor holding the points.
    """
    points, _labels = make_moons(n_samples=n, random_state=42, noise=0.03)
    # Per-axis affine map applied to both columns at once.
    axis_scale = np.array([2.0, 3.0])
    points = (points + 0.3) * axis_scale - 1
    as_float32 = points.astype(np.float32)
    return TensorDataset(torch.from_numpy(as_float32))
def line_dataset(n=8000):
    """Return a TensorDataset of ``n`` points filling a tall rectangle.

    The first coordinate is uniform on [-0.5, 0.5) and the second on
    [-1, 1); both are then multiplied by 4. The generator is seeded with
    42, so repeated calls with the same ``n`` give the same points.

    Parameters:
        n (int): Number of points to draw.

    Returns:
        TensorDataset: Wraps a single float32 tensor of shape ``(n, 2)``.
    """
    generator = np.random.default_rng(42)
    # Draw order matters for reproducibility: first axis, then second axis.
    first_axis = generator.uniform(-0.5, 0.5, n)
    second_axis = generator.uniform(-1, 1, n)
    points = 4 * np.stack([first_axis, second_axis], axis=1)
    return TensorDataset(torch.from_numpy(points.astype(np.float32)))
# def circle_dataset(n=8000):
# rng = np.random.default_rng(42)
# x = np.round(rng.uniform(-0.5, 0.5, n)/2, 1)*2
# y = np.round(rng.uniform(-0.5, 0.5, n)/2, 1)*2
# norm = np.sqrt(x**2 + y**2) + 1e-10
# x /= norm
# y /= norm
# theta = 2 * np.pi * rng.uniform(0, 1, n)
# r = rng.uniform(0, 0.03, n)
# x += r * np.cos(theta)
# y += r * np.sin(theta)
# X = np.stack((x, y), axis=1)
# X *= 3
# return TensorDataset(torch.from_numpy(X.astype(np.float32)))
def circle_dataset(n=8000):
    """Return a TensorDataset of a smiley face: outline, two eyes, a mouth.

    The outline uses ``n`` points; each eye adds ``n // 20`` points and the
    mouth adds ``n // 10``, so the dataset holds
    ``n + 2 * (n // 20) + n // 10`` points in total (more than ``n``).
    The generator is seeded with 42 and all coordinates are multiplied by 3
    at the end.

    Parameters:
        n (int): Number of points on the face outline.

    Returns:
        TensorDataset: Wraps a single float32 tensor with two columns.
    """
    gen = np.random.default_rng(42)
    full_turn = 2 * np.pi

    # Outline: snap uniform samples to a coarse grid, then push each grid
    # point out to unit distance. The 1e-10 avoids dividing by zero when
    # both coordinates round to 0.
    face_x = np.round(gen.uniform(-0.5, 0.5, n) / 2, 1) * 2
    face_y = np.round(gen.uniform(-0.5, 0.5, n) / 2, 1) * 2
    length = np.sqrt(face_x**2 + face_y**2) + 1e-10
    face_x = face_x / length
    face_y = face_y / length

    # Jitter every outline point by a small random polar offset.
    jitter_angle = full_turn * gen.uniform(0, 1, n)
    jitter_radius = gen.uniform(0, 0.03, n)
    face_x = face_x + jitter_radius * np.cos(jitter_angle)
    face_y = face_y + jitter_radius * np.sin(jitter_angle)

    x_parts = [face_x]
    y_parts = [face_y]

    # Eyes: two small circles at the same height, left one first.
    eye_y = 0.5
    eye_radius = 0.2
    for eye_x in (-0.5, 0.5):
        angle = full_turn * gen.uniform(0, 1, n // 20)
        x_parts.append(eye_x + eye_radius * np.cos(angle))
        y_parts.append(eye_y + eye_radius * np.sin(angle))

    # Mouth: a half circle flipped below the horizontal axis.
    mouth_radius = 0.5
    angle = np.pi * gen.uniform(0, 1, n // 10)
    x_parts.append(mouth_radius * np.cos(angle))
    y_parts.append(-(0.5 + mouth_radius * np.sin(angle) - mouth_radius))

    points = np.stack((np.concatenate(x_parts), np.concatenate(y_parts)), axis=1)
    points = points * 3
    return TensorDataset(torch.from_numpy(points.astype(np.float32)))
def dino_dataset(n=8000, path="static/DatasaurusDozen.tsv"):
    """Return a TensorDataset of ``n`` noisy points resampled from the "dino" rows.

    Reads a tab-separated file with ``dataset``, ``x`` and ``y`` columns,
    keeps the rows whose ``dataset`` value is ``"dino"``, draws ``n`` of
    them with replacement, adds Gaussian noise (standard deviation 0.15) to
    each coordinate, and rescales with ``(x / 54 - 1) * 4`` and
    ``(y / 48 - 1) * 4``. The generator is seeded with 42.

    Parameters:
        n (int): Number of points to draw.
        path (str): Location of the TSV file. The default is relative to the
            current working directory, so pass an explicit path when the
            caller does not run from the project root.

    Returns:
        TensorDataset: Wraps a single float32 tensor of shape ``(n, 2)``.

    Raises:
        ValueError: If the file has no rows with ``dataset == "dino"``.
    """
    table = pd.read_csv(path, sep="\t")
    dino = table[table["dataset"] == "dino"]
    if dino.empty:
        # Fail with a readable message instead of the opaque bounds error
        # that rng.integers raises for an empty range.
        raise ValueError(f"No rows with dataset == 'dino' found in {path}")

    rng = np.random.default_rng(42)
    # Draw order (indices, x noise, y noise) is kept so results stay
    # identical to earlier versions for the same input file.
    ix = rng.integers(0, len(dino), n)
    x = dino["x"].to_numpy(dtype=np.float64)[ix] + rng.normal(size=n) * 0.15
    y = dino["y"].to_numpy(dtype=np.float64)[ix] + rng.normal(size=n) * 0.15

    # NOTE(review): 54 and 48 look like per-axis centring constants for the
    # Datasaurus data; confirm against the file before changing them.
    x = (x / 54 - 1) * 4
    y = (y / 48 - 1) * 4
    points = np.stack((x, y), axis=1)
    return TensorDataset(torch.from_numpy(points.astype(np.float32)))
def get_dataset(name, n=8000):
    """Build the dataset registered under ``name``.

    Parameters:
        name (str): One of "moons", "dino", "line", "circle" or "blob".
        n (int): Size argument forwarded to the dataset builder.

    Returns:
        Whatever the selected ``*_dataset`` builder returns.

    Raises:
        ValueError: If ``name`` is not one of the known dataset names.
    """
    # Each builder is wrapped in a lambda so it is only looked up and
    # called when its name is actually selected.
    builders = (
        ("moons", lambda: moons_dataset(n)),
        ("dino", lambda: dino_dataset(n)),
        ("line", lambda: line_dataset(n)),
        ("circle", lambda: circle_dataset(n)),
        ("blob", lambda: blob_dataset(n)),
    )
    for known_name, build in builders:
        if name == known_name:
            return build()
    raise ValueError(f"Unknown dataset: {name}")
def sample_points_in_oval(num_points, a, b, center_x=0, center_y=0, rotation_angle=0, rng=None):
    """
    Rejection-sample points inside an oval, denser near its center, then rotate them.

    Candidates are drawn uniformly from the oval's bounding box. A candidate
    is kept only if it lies inside the oval and also passes a random test
    whose acceptance probability falls off linearly with its distance from
    the oval's center. Each kept point is then rotated about the coordinate
    origin (0, 0), not about the oval's center, so a non-zero rotation also
    moves where the oval ends up.

    Parameters:
        num_points (int): Number of points to sample.
        a (float): Semi-axis of the oval along x (before rotation).
        b (float): Semi-axis of the oval along y (before rotation).
        center_x (float): X-coordinate of the oval's center (before rotation).
        center_y (float): Y-coordinate of the oval's center (before rotation).
        rotation_angle (float): Rotation about the origin in degrees. Default is 0.
        rng (numpy.random.Generator or None): Source of randomness. When
            None, the global ``np.random`` state is used, which makes the
            result differ between calls unless the caller seeds it.

    Returns:
        np.array: Array of shape ``(num_points, 2)``; shape ``(0, 2)`` when
        no points are requested.
    """
    # Both the np.random module and a Generator expose uniform() and random().
    source = np.random if rng is None else rng

    # Rotation matrix
    theta = np.radians(rotation_angle)
    rotation_matrix = np.array([[np.cos(theta), -np.sin(theta)],
                                [np.sin(theta), np.cos(theta)]])

    # Distance to a bounding-box corner; constant, so computed once.
    max_distance = np.sqrt(a**2 + b**2)

    # Sample points
    points = []
    while len(points) < num_points:
        x = source.uniform(-a + center_x, a + center_x)
        y = source.uniform(-b + center_y, b + center_y)
        dx = x - center_x
        dy = y - center_y
        if dx**2 / a**2 + dy**2 / b**2 > 1:
            continue  # outside the oval; no acceptance draw is consumed
        probability_threshold = 1 - np.sqrt(dx**2 + dy**2) / max_distance
        if source.random() < probability_threshold:
            points.append(np.dot(rotation_matrix, np.array([x, y])))

    if not points:
        # Keep a two-column shape so callers can always index columns.
        return np.empty((0, 2))
    return np.array(points)


def blob_dataset(n=8000):
    """Return a TensorDataset of ``n`` points from a rotated, center-weighted oval.

    Points come from ``sample_points_in_oval`` with semi-axes 0.2 and 0.3,
    center (0.7, 0.2) and a 30 degree rotation, and are multiplied by 4.
    The sampler is driven by a generator seeded with 42, so repeated calls
    with the same ``n`` return the same points.

    Parameters:
        n (int): Number of points to sample.

    Returns:
        TensorDataset: Wraps a single float32 tensor of shape ``(n, 2)``.
    """
    rng = np.random.default_rng(42)
    points = sample_points_in_oval(
        n, 0.2, 0.3, center_x=0.7, center_y=0.2, rotation_angle=30, rng=rng
    )
    scaled = points * 4
    return TensorDataset(torch.from_numpy(scaled.astype(np.float32)))