OpenCV+DLIB를 활용한 눈 깜박임 감지 모델(Colab)

카테고리 없음 2023. 11. 24. 21:22

동영상 강의 시청시 집중도를 계산하고, 공부 시간 측정에 반영하고 다시 강의를 들어야 하는 부분을 알려주는 모델을 만들고자 한다. 집중도를 계산하기 위해 사용될 수 있는 데이터가 너무 많기 때문에 먼저 눈 깜박임 시간 데이터만 사용해 졸음 감지 모델을 만들어 볼 예정이다.

DLIB는 OpenCV가 제공하지 않는 얼굴 랜드마크를 검출할 수 있는 라이브러리다. OpenCV의 face 모듈에도 몇가지 얼굴 랜드마크가 추가되어 있지만, 아직 구현과 지원이 미흡하기 때문에 얼굴 랜드마크 검출 분야에 널리 알려져 있는 DLIB 라이브러리를 이용해서 얼굴 랜드마크 검출을 하고자 한다.

[OpenCV+DLIB를 활용해 얼굴 랜드마크 검출하기]

1. 필요한 라이브러리 import 해오기

import numpy as np
import imutils
import dlib
import cv2
from google.colab.patches import cv2_imshow

colab에서 imshow 함수를 사용하기 위해서는 from google.colab.patches import cv2_imshow 로 import 해야 사용할 수 있다.

2. OpenCV imread 함수로 이미지 파일 받아오기

image_path = '/content/drive/MyDrive/project/data/img/face_img1.jpg'  # adjust to your environment

# Read the image file into a NumPy array.
# cv2.imread does not raise on a missing or undecodable file; it returns None,
# so fail here with a clear message instead of crashing inside the resize call.
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError('Could not read image: {}'.format(image_path))

# Scale to a width of 500 px (imutils keeps the aspect ratio).
img = imutils.resize(img, width=500)

# Convert the 3-channel BGR image to a single-channel grayscale image.
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

원하는 얼굴 이미지 파일을 저장한 뒤, 드라이브에서 이미지 파일이 저장된 위치 경로를 image_path에 넣고 OpenCV imread 함수를 사용해 이미지 파일을 읽어온다. 이미지 파일은 Numpy array 형태의 숫자 값으로 읽히며, 각 숫자는 해당 위치의 색을 나타낸다. imutils.resize 함수로 이미지 가로 사이즈를 500으로 resize한다. OpenCV 컬러변환 cvtColor 함수를 사용해 BGR 컬러 이미지(OpenCV는 RGB가 아닌 BGR 순서로 읽어온다)를 Grayscale로 변환해 gray에 넣는다.

3. 얼굴 검출기 + 랜드마크 검출기

# Landmark predictor, loaded from the pre-trained 68-point model file.
predictor = dlib.shape_predictor(
    '/content/drive/MyDrive/project/lib/landmark/shape_predictor_68_face_landmarks.dat'  # adjust to your environment
)

# Frontal face detector.
detector = dlib.get_frontal_face_detector()

얼굴 랜드마크 검출기인 shape_predictor_68_face_landmarks.dat 파일은 http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2에서 다운받아서 경로를 넣어주면 된다.

4. 얼굴 영역 + 랜드마크 검출 및 표시

# Detect face regions on the grayscale image.
faces = detector(gray)

# enumerate() supplies the face index `t` used in the label below.
for t, rect in enumerate(faces):
    # Convert the detected region to corner coordinates and draw its box.
    x, y = rect.left(), rect.top()
    w, h = rect.right() - x, rect.bottom() - y
    cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 1)
    cv2.putText(img, "Face #{}".format(t + 1), (x - 10, y - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Detect the facial landmarks inside this face region.
    shape = predictor(gray, rect)
    for i in range(68):
        # Draw each landmark as a filled dot and print its index next to it.
        part = shape.part(i)
        cv2.circle(img, (part.x, part.y), 2, (0, 0, 255), -1)  # colour is (Blue, Green, Red)
        cv2.putText(img, str(i), (part.x, part.y), cv2.FONT_HERSHEY_PLAIN,
                    0.5, (255, 255, 255), 1, cv2.LINE_AA)

cv2_imshow(img)
# Kept from the original.
# NOTE(review): presumably unnecessary in Colab, where cv2_imshow renders inline - confirm.
cv2.waitKey(0)

위 코드 출력시 왼쪽 이미지가 출력된다. detector로 얼굴 영역을 검출해 faces에 넣고, 검출한 영역을 좌표로 변환 후 사각형을 그려준다. 검출된 하나의 얼굴 영역에서 랜드마크를 검출해 shape에 넣고 부위별 좌표를 추출하고 표시해준다.

detector = dlib.get_frontal_face_detector() : 얼굴 검출기 생성
predictor = dlib.shape_predictor(file) : 랜드마크 검출기 생성
rects = detector(img) : 얼굴 검출
- rects : 얼굴 좌표 배열
- rect.left() : 얼굴 영역 x 좌표
- rect.right() : 얼굴 영역 x2 좌표
- rect.top() : 얼굴 영역 y 좌표
- rect.bottom() : 얼굴 영역 y2 좌표
shape = predictor(img, rect) : 랜드마크 검출
- rect : 얼굴 영역
- shape : 랜드마크 영역
  - mark = shape.part(id) : id(0~67)로 좌표 획득
  - mark.x , mark.y : id에 대응하는 좌표

cv2.putText 함수를 사용해서 랜드마크 좌표를 표시해줄 수 있고, 필요없다면 삭제할 수 있다.

cv2.putText(image, text, org, font, fontScale, color[, thickness[, lineType[, bottomLeftOrigin]]])

5. Video(mp4) 파일에서 얼굴 랜드마크 검출 및 표시

video_path = '/content/drive/MyDrive/project/data/img/video1.mp4'  # adjust to your environment
output_path = '/content/drive/MyDrive/project/data/img/video1_output.mp4'

EYES = list(range(36, 48))  # landmark indices of the eye region

# Open the input video; every property below is read from this capture.
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise IOError('Could not open video: {}'.format(video_path))

# Frame size of the input. Frames are written unresized, so the writer is
# created with exactly this size rather than a hard-coded 1920x1080.
img_w = round(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
img_h = round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Some sources report 0 for FPS; fall back to a sane default so that neither
# the writer nor the delay computation breaks.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps or fps <= 0:
    fps = 30.0

# Build the fourcc code; *'DIVX' unpacks the string into 'D', 'I', 'V', 'X'.
fourcc = cv2.VideoWriter_fourcc(*'DIVX')

# Interval between consecutive frames in milliseconds.
delay = round(1000 / fps)

# Create the writer ONCE, before the loop. Re-creating it for every frame
# reopens and truncates the output file, leaving only the last frame in it.
out = cv2.VideoWriter(output_path, fourcc, fps, (img_w, img_h))
if not out.isOpened():
    cap.release()
    raise IOError('Could not open video writer: {}'.format(output_path))

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # Reached at the end of the stream as well as on a read error.
            print('Could not read frame')
            break

        # The extra cap.grab() call was removed: it discarded every second
        # frame while the output was still written at the source FPS.

        # Detect face regions on the grayscale frame (upsampled once).
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = detector(gray, 1)

        for t, rect in enumerate(faces):
            # Draw the face box and its label.
            left, top = rect.left(), rect.top()
            right, bottom = rect.right(), rect.bottom()
            cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 1)
            cv2.putText(frame, "Face #{}".format(t + 1), (left - 10, top - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Detect the landmarks of this face and mark each one with a dot.
            # The grayscale frame is used, consistent with the still-image example.
            shape = predictor(gray, rect)
            for i in range(68):
                part = shape.part(i)
                cv2.circle(frame, (part.x, part.y), 2, (0, 255, 255), -1)

        out.write(frame)  # append the annotated frame to the output video
        # cv2_imshow(frame)  # uncomment to display every frame inline
finally:
    # Always release both handles so the output file is finalized.
    cap.release()
    out.release()

주석을 풀면 출력창에서 캡처된 프레임별로 추출된 얼굴 랜드마크를 볼 수 있다. 원래 영상으로 출력되는 것 같은데 colab에서는 영상 출력이 안되는지 프레임 개수만큼 사진들이 왕창 출력된다. 위 코드 실행 시 output_path로 지정한 경로 파일로 영상이 저장돼야 하는데, 한 프레임밖에 저장되지 않는데 이건 좀 알아봐야 할 것 같다.

[OpenCV+DLIB를 활용해 눈 깜박임 분류 훈련 및 예측]

1. 필요한 라이브러리 import

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchsummary import summary
from torchvision.transforms import transforms

2. data load

class eyes_dataset(Dataset):
    """Dataset pairing eye-image arrays with their labels.

    Indexing returns ``(x, y)`` as float tensors built from the idx-th
    entries of the two arrays given to the constructor.
    """

    def __init__(self, x_file_paths, y_file_path, transform=None):
        # Despite the parameter names, __getitem__ passes the indexed entries
        # to torch.from_numpy, so both are expected to hold NumPy arrays.
        self.x_files, self.y_files = x_file_paths, y_file_path
        # NOTE(review): the transform is stored but never applied in __getitem__.
        self.transform = transform

    def __len__(self):
        """Return the number of samples (length of the image array)."""
        return len(self.x_files)

    def __getitem__(self, idx):
        """Return the idx-th image and label as float tensors."""
        sample = torch.from_numpy(self.x_files[idx]).float()
        label = torch.from_numpy(self.y_files[idx]).float()
        return sample, label

# Training images and labels, loaded as float32 arrays (adjust the paths to your environment).
x_train = np.load(
    '/content/drive/MyDrive/project/data/dataset/x_train.npy'
).astype(np.float32)
y_train = np.load(
    '/content/drive/MyDrive/project/data/dataset/y_train.npy'
).astype(np.float32)

# Augmentation pipeline handed to the dataset: tensor conversion, a random
# rotation of up to 10 degrees, and a random horizontal flip.
# NOTE(review): eyes_dataset stores this object but does not apply it when indexed.
train_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.RandomRotation(10),
        transforms.RandomHorizontalFlip(),
    ]
)

train_dataset = eyes_dataset(
    x_file_paths=x_train,
    y_file_path=y_train,
    transform=train_transform,
)

x_train.npy와 y_train.npy를 load해오고 train_dataset을 만든다. 관련 데이터셋은 https://github.com/kairess/eye_blink_detector 에서 받을 수 있다.

- train_dataset 출력

fig = plt.figure(figsize=(12, 10))

# Show the first 16 training samples in a 4x4 grid, each titled with its label.
for i in range(16):
    # Fetched exactly as before; the plot itself reads the raw arrays.
    x, y = train_dataset[i]

    ax = fig.add_subplot(4, 4, i + 1)
    # Each sample is reshaped to a 26x34 image for display.
    ax.imshow(x_train[i].reshape((26, 34)), cmap='gray')
    ax.set_title(str(y_train[i]))

plt.show()

데이터 16개를 plot해서 x_train에 있는 이미지와 y_train에 있는 label을 확인해본다. 눈을 뜬 경우 1, 감은 경우 0으로 라벨링 되어 있는 것을 볼 수 있다.

3. CNN 모델 구성과 학습

class Net(nn.Module):
    """CNN that outputs one logit per 1x26x34 eye image.

    Three conv + ReLU + 2x2 max-pool stages are followed by two fully
    connected layers. The output is a raw logit; callers apply the sigmoid
    themselves (e.g. through BCEWithLogitsLoss).
    """

    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        # 1536 = 128 channels * 3 * 4: a 26x34 input halves to 13x17, 6x8
        # and finally 3x4 across the three pooling stages.
        self.fc1 = nn.Linear(1536, 512)
        self.fc2 = nn.Linear(512, 1)

    def forward(self, x):
        """Return logits of shape (batch, 1) for input of shape (batch, 1, 26, 34)."""
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = F.max_pool2d(F.relu(self.conv3(x)), 2)
        # Flatten the feature maps for the fully connected layers.
        x = x.reshape(-1, 1536)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)

        return x

# Build the network, move it to the GPU, and print a per-layer summary for a
# single 1x26x34 input.
model = Net()
model.to('cuda')
summary(model, input_size=(1, 26, 34))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1           [-1, 32, 26, 34]             320
            Conv2d-2           [-1, 64, 13, 17]          18,496
            Conv2d-3            [-1, 128, 6, 8]          73,856
            Linear-4                  [-1, 512]         786,944
            Linear-5                    [-1, 1]             513
================================================================
Total params: 880,129
Trainable params: 880,129
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.37
Params size (MB): 3.36
Estimated Total Size (MB): 3.74
----------------------------------------------------------------

colab에서 model = Net().to('cuda')을 사용하려면 런타임 유형을 GPU로 변환해야 한다. 모델을 계속 실행하다가 문제가 발생했는데, 알 수 없는 이유로 런타임이 중지되고 초기화되는데, 해결 방법은 런타임(GPU) 연결, 드라이브 mount 후에 바로 위 코드를 실행하면 문제 없이 실행할 수 있다. 예상하기로는 그 전에 너무 많은 데이터가 저장되어 있어서 그 다음에 CNN을 구축하려 하면 문제가 생기는 것 같다.

4. accuracy 함수 정의 및 모델 훈련, 예측 준비

def accuracy(y_pred, y_test):
    """Return the batch accuracy as a percentage rounded to a whole number.

    y_pred holds raw logits; they go through a sigmoid and are rounded to
    0 or 1 before being compared element-wise with y_test. The result is a
    scalar tensor.
    """
    predicted = torch.sigmoid(y_pred).round()
    n_correct = predicted.eq(y_test).sum().float()

    # Fraction correct over the batch dimension, scaled to percent, then rounded.
    return torch.round(n_correct / y_test.shape[0] * 100)

# Destination file for the trained weights (adjust to your environment).
PATH = '/content/drive/MyDrive/project/data/dataset/weights/trained.pth'

# Shuffled mini-batches of 32 samples, loaded by 4 worker processes.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)

# Fresh network on the GPU.
model = Net().to('cuda')

# Binary cross-entropy on raw logits, optimized with Adam.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.0001)

accuracy 함수는 예측값(logit)과 라벨값을 받아, 예측값을 sigmoid로 확률화한 뒤 round로 반올림해 0 또는 1로 만들어 y_pred_tag에 넣는다. 그런 다음 라벨과 일치하는 비율을 백분율(정수로 반올림)로 계산해 반환한다.

5. train 데이터로 모델 훈련

# Number of passes over the training set. The original never defined this
# name; 10 matches the "[n/10]" progress lines of the recorded run.
epochs = 10

# Print the running averages once every this many batches.
log_every = 80

for epoch in range(epochs):
    running_loss = 0.0
    running_acc = 0.0

    model.train()

    for i, data in enumerate(train_dataloader, 0):
        input_1, labels = data[0].to('cuda'), data[1].to('cuda')

        # Reorder the axes from (0, 1, 2, 3) to (0, 3, 1, 2).
        # Presumably this turns batches stored as (N, H, W, C) into the
        # (N, C, H, W) layout Conv2d expects - TODO confirm against the dataset.
        inputs = input_1.transpose(1, 3).transpose(2, 3)

        optimizer.zero_grad()

        outputs = model(inputs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += accuracy(outputs, labels).item()

        if i % log_every == log_every - 1:
            print('epoch: [%d/%d] train_loss: %.5f train_acc: %.5f' % (
                epoch + 1, epochs, running_loss / log_every, running_acc / log_every))
            # Reset BOTH accumulators. Resetting only the loss would inflate
            # the accuracy whenever an epoch has more than one logging window.
            running_loss = 0.0
            running_acc = 0.0

print("learning finish")
torch.save(model.state_dict(), PATH)

epoch: [1/10] train_loss: 0.01442 train_acc: 99.51250
epoch: [2/10] train_loss: 0.00634 train_acc: 99.88750
epoch: [3/10] train_loss: 0.00463 train_acc: 99.96250
epoch: [4/10] train_loss: 0.01392 train_acc: 99.55000
epoch: [5/10] train_loss: 0.00488 train_acc: 99.96250
epoch: [6/10] train_loss: 0.00300 train_acc: 100.00000
epoch: [7/10] train_loss: 0.00259 train_acc: 100.00000
epoch: [8/10] train_loss: 0.00220 train_acc: 100.00000
epoch: [9/10] train_loss: 0.00121 train_acc: 100.00000
epoch: [10/10] train_loss: 0.00115 train_acc: 100.00000
learning finish

epoch 10번 정도면 train 정확도가 100%가 나와 충분하다고 볼 수 있다.

6. test 데이터로 예측

# Path of the weights saved during training.
PATH = '/content/drive/MyDrive/project/data/dataset/weights/trained.pth'

# Validation images and labels, loaded as float32 arrays (adjust the paths to your environment).
x_test = np.load('/content/drive/MyDrive/project/data/dataset/x_val.npy').astype(np.float32)
y_test = np.load('/content/drive/MyDrive/project/data/dataset/y_val.npy').astype(np.float32)

test_transform = transforms.Compose([
    transforms.ToTensor()
])

test_dataset = eyes_dataset(x_test, y_test, transform=test_transform)

# One sample per batch, in file order.
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=4)

# Rebuild the network on the GPU and restore the trained weights.
model = Net()
model.to('cuda')
model.load_state_dict(torch.load(PATH))
model.eval()

# Number of batches evaluated so far.
count = 0

with torch.no_grad():
    total_acc = 0.0
    acc = 0.0
    for i, test_data in enumerate(test_dataloader, 0):
        data, labels = test_data[0].to('cuda'), test_data[1].to('cuda')

        # Same axis reordering as in training: (0, 1, 2, 3) -> (0, 3, 1, 2).
        data = data.transpose(1, 3).transpose(2, 3)

        outputs = model(data)

        acc = accuracy(outputs, labels)
        total_acc += acc

        # i is zero-based, so the number of batches seen is i + 1. Dividing
        # by i overstated the average and failed for a single-sample set.
        count = i + 1

    if count:
        print('avarage acc: %.5f' % (total_acc/count),'%')
    else:
        print('no test samples to evaluate')

print('test finish!')

avarage acc: 99.65157 %
test finish!

훈련할 때 저장한 weight를 가져와 test 데이터셋으로 예측하고 정확도를 계산했을 때 99.65%가 나오는 것을 보면 모델이 잘 만들어졌음을 알 수 있다.

import cv2
import dlib
import numpy as np
import torch
from imutils import face_utils

# (width, height) of the eye crops fed to the network.
IMG_SIZE = (34, 26)
PATH = '/content/drive/MyDrive/project/data/dataset/weights/trained.pth'

detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor('/content/drive/MyDrive/project/lib/landmark/shape_predictor_68_face_landmarks.dat')

# The model stays on the CPU here, while the checkpoint was saved from a CUDA
# model. map_location remaps the stored tensors so that loading also works on
# a runtime without a GPU.
model = Net()
model.load_state_dict(torch.load(PATH, map_location='cpu'))
model.eval()

# Consecutive "both eyes closed" detections.
n_count = 0

모델이 잘 훈련된 것을 봤으니 직접 영상을 가져와 눈 깜박임을 감지해보도록 한다.

필요한 weight와 랜드마크 검출 라이브러리, model을 가져온다.

def crop_eye(img, eye_points, img_size=None):
    """Crop the region around one eye from ``img``.

    The crop is centred on the bounding box of ``eye_points``, is 1.2 times
    as wide as that box, and has the aspect ratio of ``img_size``.

    Args:
        img: 2-D image array to crop from.
        eye_points: array of (x, y) landmark coordinates of a single eye.
        img_size: (width, height) giving the crop's aspect ratio. Defaults to
            the module-level IMG_SIZE.

    Returns:
        (eye_img, eye_rect): the cropped image and the integer array
        [min_x, min_y, max_x, max_y] of the unclamped crop rectangle.
    """
    if img_size is None:
        img_size = IMG_SIZE

    x1, y1 = np.amin(eye_points, axis=0)
    x2, y2 = np.amax(eye_points, axis=0)
    cx, cy = (x1 + x2) / 2, (y1 + y2) / 2

    # Widen the landmark box by 20% and derive the height from the target ratio.
    w = (x2 - x1) * 1.2
    h = w * img_size[1] / img_size[0]

    margin_x, margin_y = w / 2, h / 2

    min_x, min_y = int(cx - margin_x), int(cy - margin_y)
    max_x, max_y = int(cx + margin_x), int(cy + margin_y)

    # The np.int alias was removed from NumPy; the builtin int is equivalent.
    eye_rect = np.array([min_x, min_y, max_x, max_y], dtype=int)

    # Crop from the image that was passed in, not from a global. Negative
    # bounds are clamped to 0 because a negative slice index would wrap
    # around to the far edge of the image.
    top, bottom = max(min_y, 0), max(max_y, 0)
    left, right = max(min_x, 0), max(max_x, 0)
    eye_img = img[top:bottom, left:right]

    return eye_img, eye_rect

def predict(pred):
    """Classify one eye crop with the global model.

    The tensor's axes are reordered from (0, 1, 2, 3) to (0, 3, 1, 2) before
    the forward pass. The returned tensor holds the sigmoid of the logits
    rounded to 0.0 or 1.0 (the post's labels use 1 for open, 0 for closed).
    """
    channels_first = pred.transpose(1, 3).transpose(2, 3)
    logits = model(channels_first)
    return torch.sigmoid(logits).round()

눈 부분만 가져와 학습시킨 CNN 모델을 적용하기 위해 crop_eye 함수를 만든다.

cap = cv2.VideoCapture(video_path)

try:
    while cap.isOpened():
        ret, img_ori = cap.read()

        if not ret:
            break

        # Work on a half-size copy of the frame.
        img_ori = cv2.resize(img_ori, dsize=(0, 0), fx=0.5, fy=0.5)

        img = img_ori.copy()
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        faces = detector(gray)

        for face in faces:
            shapes = predictor(gray, face)
            shapes = face_utils.shape_to_np(shapes)

            # Landmarks 36-41 and 42-47 are the two eyes.
            eye_img_l, eye_rect_l = crop_eye(gray, eye_points=shapes[36:42])
            eye_img_r, eye_rect_r = crop_eye(gray, eye_points=shapes[42:48])

            # A crop that falls outside the frame is empty and would make
            # cv2.resize raise; skip this face for the current frame.
            if eye_img_l.size == 0 or eye_img_r.size == 0:
                continue

            eye_img_l = cv2.resize(eye_img_l, dsize=IMG_SIZE)
            eye_img_r = cv2.resize(eye_img_r, dsize=IMG_SIZE)
            # Mirror the second eye horizontally.
            # NOTE(review): presumably so both eyes share the orientation the
            # model was trained on - confirm against the dataset.
            eye_img_r = cv2.flip(eye_img_r, flipCode=1)

            # Shape (1, height, width, 1) float32 arrays, then tensors.
            eye_input_l = eye_img_l.copy().reshape((1, IMG_SIZE[1], IMG_SIZE[0], 1)).astype(np.float32)
            eye_input_r = eye_img_r.copy().reshape((1, IMG_SIZE[1], IMG_SIZE[0], 1)).astype(np.float32)

            eye_input_l = torch.from_numpy(eye_input_l)
            eye_input_r = torch.from_numpy(eye_input_r)

            # Read each prediction out as a plain float once.
            pred_l = predict(eye_input_l).item()
            pred_r = predict(eye_input_r).item()

            # Count consecutive detections in which both eyes are predicted 0.
            if pred_l == 0.0 and pred_r == 0.0:
                n_count += 1
            else:
                n_count = 0

            if n_count > 100:
                cv2.putText(img, "Wake up", (120, 160), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

            # visualize
            state_l = ('O %.1f' if pred_l > 0.1 else '- %.1f') % pred_l
            state_r = ('O %.1f' if pred_r > 0.1 else '- %.1f') % pred_r

            # Plain Python ints, so drawing does not depend on OpenCV
            # accepting NumPy integer scalars inside point tuples.
            l_pt1 = tuple(int(v) for v in eye_rect_l[0:2])
            l_pt2 = tuple(int(v) for v in eye_rect_l[2:4])
            r_pt1 = tuple(int(v) for v in eye_rect_r[0:2])
            r_pt2 = tuple(int(v) for v in eye_rect_r[2:4])

            cv2.rectangle(img, l_pt1, l_pt2, (0, 255, 0), 1)
            cv2.rectangle(img, r_pt1, r_pt2, (0, 255, 0), 1)

            cv2.putText(img, state_l, l_pt1, cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 1)
            cv2.putText(img, state_r, r_pt1, cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 1)

        cv2_imshow(img)

        # The per-frame cv2.waitKey(0) was dropped: with a GUI window it would
        # stall on every frame until a key is pressed. The 1 ms poll for 'q'
        # is kept.
        if cv2.waitKey(1) == ord('q'):
            break
finally:
    # Release the capture even if a frame fails part-way through.
    cap.release()

위에서 처럼 VideoCapture 함수를 사용해 영상을 frame으로 쪼개고 crop_eye 함수를 사용해 눈 부분만 가져와 눈을 감았는지 계산하고, cv2.rectangle 함수로 크롭해온 눈 부분을 시각화해준다.

프로젝트를 위해 집중도 감지 모델을 만들어야 하는데, 눈 깜박임 외에 머리 숙임 등을 감지해 집중도를 감지할 수 있도록 할 예정이다. 동영상 시청시 사용자가 시작 버튼을 누르면 노트북 카메라를 입력으로 받아 OpenCV와 DLIB를 활용해 얼마나 집중하고 있는지 감지하고 실시간으로 집중도를 그래프로 그려주도록 할 예정이다.

전체 코드
https://colab.research.google.com/drive/1GSVYcsoMucwyjqCvTLyRyQuyZpTV3Lcb?usp=sharing

참고 자료

이세우, "파이썬으로 만드는 OpenCV 프로젝트: 간단한 영상 입출력부터 머신러닝까지", 프로그래밍인사이트, 2019
https://github.com/kairess/eye_blink_detector

GitHub - kairess/eye_blink_detector: Eye blink(Closeness-Openess) detection using CNN (Keras)

Eye blink(Closeness-Openess) detection using CNN (Keras) - GitHub - kairess/eye_blink_detector: Eye blink(Closeness-Openess) detection using CNN (Keras)

github.com

https://ys-cs17.tistory.com/24

ABOUT ME

g-ia g-ia

[OpenCV+DLIB를 활용해 얼굴 랜드마크 검출하기]

[OpenCV+DLIB를 활용해 눈 깜박임 분류 훈련 및 예측]

티스토리툴바