[TenSEAL] Tutorial 1 : Training and Evaluation of Logistic Regression on Encrypted Data
This is a post translated while studying TenSEAL, OpenMined's library built on Microsoft's SEAL.
https://github.com/OpenMined/TenSEAL
1. Setup
import torch
import tenseal as ts
import pandas as pd
import random
from time import time
# those are optional and are not necessary for training
import numpy as np
import matplotlib.pyplot as plt
torch.random.manual_seed(73)
random.seed(73)
def split_train_test(x, y, test_ratio=0.3):
    idxs = [i for i in range(len(x))]
    random.shuffle(idxs)
    # delimiter between test and train data
    delim = int(len(x) * test_ratio)
    test_idxs, train_idxs = idxs[:delim], idxs[delim:]
    return x[train_idxs], y[train_idxs], x[test_idxs], y[test_idxs]
def heart_disease_data():
    data = pd.read_csv("./data/framingham.csv")
    # drop rows with missing values
    data = data.dropna()
    # drop some features
    data = data.drop(columns=["education", "currentSmoker", "BPMeds", "diabetes", "diaBP", "BMI"])
    # balance data
    grouped = data.groupby('TenYearCHD')
    data = grouped.apply(lambda x: x.sample(grouped.size().min(), random_state=73).reset_index(drop=True))
    # extract labels
    y = torch.tensor(data["TenYearCHD"].values).float().unsqueeze(1)
    data = data.drop(columns=["TenYearCHD"])
    # standardize data
    data = (data - data.mean()) / data.std()
    x = torch.tensor(data.values).float()
    return split_train_test(x, y)
def random_data(m=1024, n=2):
    # data separable by the line `y = x`
    x_train = torch.randn(m, n)
    x_test = torch.randn(m // 2, n)
    y_train = (x_train[:, 0] >= x_train[:, 1]).float().unsqueeze(0).t()
    y_test = (x_test[:, 0] >= x_test[:, 1]).float().unsqueeze(0).t()
    return x_train, y_train, x_test, y_test
# You can use whatever data you want without modification to the tutorial
# x_train, y_train, x_test, y_test = random_data()
x_train, y_train, x_test, y_test = heart_disease_data()
print("############# Data summary #############")
print(f"x_train has shape: {x_train.shape}")
print(f"y_train has shape: {y_train.shape}")
print(f"x_test has shape: {x_test.shape}")
print(f"y_test has shape: {y_test.shape}")
print("#######################################")
The dataset used is the Framingham heart study data, which predicts the 10-year risk of future coronary heart disease (CHD); it is available on Kaggle.
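As a quick check (my own addition, not part of the original tutorial), the label mean should be close to 0.5 after the balancing step in heart_disease_data:

# Sanity check (not in the original tutorial): after balancing,
# positives and negatives should each make up roughly half the data.
print(f"Positive rate in y_train: {y_train.mean().item():.3f}")  # ~0.5
print(f"Positive rate in y_test:  {y_test.mean().item():.3f}")   # ~0.5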
2. Training a Logistic Regression model
Logistic regression is used here because training it follows the same procedure as training a single layer of a typical neural network. We first train on plaintext data.
class LR(torch.nn.Module):
    def __init__(self, n_features):
        super(LR, self).__init__()
        self.lr = torch.nn.Linear(n_features, 1)

    def forward(self, x):
        out = torch.sigmoid(self.lr(x))
        return out
# define the number of epochs for both plain and encrypted training
# (defined before `train`, since it is used as a default argument)
EPOCHS = 5

def train(model, optim, criterion, x, y, epochs=EPOCHS):
    for e in range(1, epochs + 1):
        optim.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        loss.backward()
        optim.step()
        print(f"Loss at epoch {e}: {loss.data}")
    return model
def accuracy(model, x, y):
    out = model(x)
    correct = torch.abs(y - out) < 0.5
    return correct.float().mean()
n_features = x_train.shape[1]
model = LR(n_features)
# use gradient descent with a learning_rate=1
optim = torch.optim.SGD(model.parameters(), lr=1)
# use Binary Cross Entropy Loss
criterion = torch.nn.BCELoss()
model = train(model, optim, criterion, x_train, y_train)
# accuracy
plain_accuracy = accuracy(model, x_test, y_test)
print(f"Accuracy on plain test_set: {plain_accuracy}")
The accuracy comes out at roughly 0.7. What matters is that we get a similar accuracy when evaluating on homomorphically encrypted data.
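TenSEAL operates on Python lists rather than torch tensors, so the encrypted wrapper in the next section extracts the trained parameters as lists. A small illustration of that extraction (the same calls used in EncryptedLR.__init__ below):

# Illustration: the PyTorch parameters pulled out as plain Python lists.
w = model.lr.weight.data.tolist()[0]  # n_features floats
b = model.lr.bias.data.tolist()       # a single-element list
print(len(w), b)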
3. Encrypted Evaluation
First, we build a model that can process encrypted data.
class EncryptedLR:
    def __init__(self, torch_lr):
        # TenSEAL processes lists and not torch tensors,
        # so we take out the parameters from the PyTorch model
        self.weight = torch_lr.lr.weight.data.tolist()[0]
        self.bias = torch_lr.lr.bias.data.tolist()

    def forward(self, enc_x):
        # We don't need to perform sigmoid as this model
        # will only be used for evaluation, and the label
        # can be deduced without applying sigmoid
        enc_out = enc_x.dot(self.weight) + self.bias
        return enc_out

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

    # The two methods below can be used to encrypt/decrypt the model itself
    def encrypt(self, context):
        self.weight = ts.ckks_vector(context, self.weight)
        self.bias = ts.ckks_vector(context, self.bias)

    def decrypt(self):
        self.weight = self.weight.decrypt()
        self.bias = self.bias.decrypt()
# create model
eelr = EncryptedLR(model)
# parameters
poly_mod_degree = 4096
coeff_mod_bit_sizes = [40, 20, 40]  # allows a single ciphertext multiplication
# create TenSEALContext
ctx_eval = ts.context(ts.SCHEME_TYPE.CKKS, poly_mod_degree, -1, coeff_mod_bit_sizes)
# scale of ciphertext to use
ctx_eval.global_scale = 2 ** 20
# this key is needed for doing dot-product operations
ctx_eval.generate_galois_keys()
# encrypt test dataset
t_start = time()
enc_x_test = [ts.ckks_vector(ctx_eval, x.tolist()) for x in x_test]
t_end = time()
# (optional) encrypt the model's parameters
# eelr.encrypt(ctx_eval)
print(f"Encryption of the test-set took {int(t_end - t_start)} seconds")
The forward pass contains no activation function; instead, the output is decrypted and the ordinary sigmoid is applied in plaintext. For now, the goal is simply to confirm that inference on encrypted data works. The test code is as follows.
def encrypted_evaluation(model, enc_x_test, y_test):
    t_start = time()
    correct = 0
    for enc_x, y in zip(enc_x_test, y_test):
        # encrypted evaluation
        enc_out = model(enc_x)
        # plain comparison
        out = enc_out.decrypt()
        out = torch.tensor(out)
        out = torch.sigmoid(out)
        if torch.abs(out - y) < 0.5:
            correct += 1
    t_end = time()
    print(f"Evaluated test_set of {len(x_test)} entries in {int(t_end - t_start)} seconds")
    print(f"Accuracy: {correct}/{len(x_test)} = {correct / len(x_test)}")
    return correct / len(x_test)
encrypted_accuracy = encrypted_evaluation(eelr, enc_x_test, y_test)
diff_accuracy = plain_accuracy - encrypted_accuracy
print(f"Difference between plain and encrypted accuracies: {diff_accuracy}")
if diff_accuracy < 0:
    print("Oh! We got a better accuracy on the encrypted test-set! The noise was on our side...")
On average, the accuracy comes out similarly, at around 0.7.
4. Training an Encrypted LR model on Encrypted data
In this part, we train the model on encrypted data. Because CKKS supports only a limited number of multiplications per ciphertext, the model is decrypted and re-encrypted at every epoch of the training loop below. The loss is the regularized binary cross-entropy:
$$ Loss(\theta) = -\frac{1}{m} \sum_{i=1}^m \left[ y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i) \right] + \frac{\lambda}{2m} \sum_{j=1}^n \theta_j^2 $$
The parameter update follows the equation below:
$$ \theta_j = \theta_j - \alpha \left[ \frac{1}{m} \sum_{i=1}^m (\hat{y}_i - y_i) x_{ij} + \frac{\lambda}{m} \theta_j \right] $$
Due to the constraints of homomorphic encryption, we fix $\alpha = 1$ and $\lambda / m = 0.05$.
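For intuition, here is a minimal plaintext NumPy sketch of one such update step (my own illustration, not part of the tutorial; `plain_update` and its arguments are hypothetical names):

import numpy as np

def plain_update(w, b, X, y, alpha=1.0, lam_over_m=0.05):
    # One gradient step of the update rule above, on plaintext arrays.
    m = len(X)
    y_hat = 1 / (1 + np.exp(-(X @ w + b)))  # exact sigmoid; no HE constraint here
    err = y_hat - y
    w = w - alpha * (X.T @ err / m + lam_over_m * w)
    b = b - alpha * err.mean()
    return w, b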
For the sigmoid function, a polynomial approximation is used. Following the paper referenced in the code below (https://eprint.iacr.org/2018/462.pdf), sigmoid can be approximated on $[-5, 5]$ as:
$$ sigmoid(x) \approx 0.5 + 0.197x - 0.004x^3 $$
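How good is this approximation? A quick plaintext check (my own addition, not part of the tutorial) compares it to the true sigmoid over $[-5, 5]$:

import torch

# Compare the degree-3 approximation against torch.sigmoid on [-5, 5].
xs = torch.linspace(-5, 5, 101)
approx = 0.5 + 0.197 * xs - 0.004 * xs ** 3
max_err = (torch.sigmoid(xs) - approx).abs().max()
print(f"Max approximation error on [-5, 5]: {max_err:.4f}")  # ~0.05, near |x| ≈ 4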
In total, going from the input data to a parameter update requires 6 ciphertext multiplications:
- 1 for the dot product
- 2 for the sigmoid approximation
- 3 for backpropagation
class EncryptedLR:
    def __init__(self, torch_lr):
        self.weight = torch_lr.lr.weight.data.tolist()[0]
        self.bias = torch_lr.lr.bias.data.tolist()
        # we accumulate gradients and count the number of iterations
        self._delta_w = 0
        self._delta_b = 0
        self._count = 0

    def forward(self, enc_x):
        enc_out = enc_x.dot(self.weight) + self.bias  # 1 multiplication
        enc_out = EncryptedLR.sigmoid(enc_out)  # 2 multiplications
        return enc_out

    def backward(self, enc_x, enc_out, enc_y):
        out_minus_y = (enc_out - enc_y)
        self._delta_w += enc_x * out_minus_y  # 1 multiplication
        self._delta_b += out_minus_y
        self._count += 1

    def update_parameters(self):
        if self._count == 0:
            raise RuntimeError("You should at least run one forward iteration")
        # update weights
        # We use a small regularization term to keep the output
        # of the linear layer in the range of the sigmoid approximation
        self.weight -= self._delta_w * (1 / self._count) + self.weight * 0.05  # 1 multiplication
        self.bias -= self._delta_b * (1 / self._count)  # 1 multiplication
        # reset gradient accumulators and iterations count
        self._delta_w = 0
        self._delta_b = 0
        self._count = 0

    @staticmethod
    def sigmoid(enc_x):
        # We use the polynomial approximation of degree 3
        # sigmoid(x) = 0.5 + 0.197 * x - 0.004 * x^3
        # from https://eprint.iacr.org/2018/462.pdf
        # which fits the function pretty well in the range [-5,5]
        return enc_x.polyval([0.5, 0.197, 0, -0.004])

    def plain_accuracy(self, x_test, y_test):
        # evaluate accuracy of the model on
        # the plain (x_test, y_test) dataset
        w = torch.tensor(self.weight)
        b = torch.tensor(self.bias)
        out = torch.sigmoid(x_test.matmul(w) + b).reshape(-1, 1)
        correct = torch.abs(y_test - out) < 0.5
        return correct.float().mean()

    def encrypt(self, context):
        self.weight = ts.ckks_vector(context, self.weight)
        self.bias = ts.ckks_vector(context, self.bias)

    def decrypt(self):
        self.weight = self.weight.decrypt()
        self.bias = self.bias.decrypt()

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)
# parameters
poly_mod_degree = 8192
coeff_mod_bit_sizes = [40, 21, 21, 21, 21, 21, 21, 40]  # six middle primes, matching the 6 multiplications needed
# create TenSEALContext
ctx_training = ts.context(ts.SCHEME_TYPE.CKKS, poly_mod_degree, -1, coeff_mod_bit_sizes)
ctx_training.global_scale = 2 ** 21
ctx_training.generate_galois_keys()
t_start = time()
enc_x_train = [ts.ckks_vector(ctx_training, x.tolist()) for x in x_train]
enc_y_train = [ts.ckks_vector(ctx_training, y.tolist()) for y in y_train]
t_end = time()
print(f"Encryption of the training_set took {int(t_end - t_start)} seconds")
eelr = EncryptedLR(LR(n_features))
accuracy = eelr.plain_accuracy(x_test, y_test)
print(f"Accuracy at epoch #0 is {accuracy}")
times = []
for epoch in range(EPOCHS):
    eelr.encrypt(ctx_training)  # model encryption

    # if you want to keep an eye on the distribution to make sure
    # the function approximation is still working fine
    # WARNING: this operation is time consuming
    # encrypted_out_distribution(eelr, enc_x_train)

    t_start = time()
    for enc_x, enc_y in zip(enc_x_train, enc_y_train):
        enc_out = eelr.forward(enc_x)
        eelr.backward(enc_x, enc_out, enc_y)
    eelr.update_parameters()
    t_end = time()
    times.append(t_end - t_start)

    eelr.decrypt()
    accuracy = eelr.plain_accuracy(x_test, y_test)
    print(f"Accuracy at epoch #{epoch + 1} is {accuracy}")
print(f"\nAverage time per epoch: {int(sum(times) / len(times))} seconds")
print(f"Final accuracy is {accuracy}")
diff_accuracy = plain_accuracy - accuracy
print(f"Difference between plain and encrypted accuracies: {diff_accuracy}")
if diff_accuracy < 0:
    print("Oh! We got a better accuracy when training on encrypted data! The noise was on our side...")
The final accuracy comes out around 0.66, about 0.04 lower than the plaintext model, but it shows that training on encrypted data is feasible.