Safety Detection


A competition required raising an alarm, via construction-site surveillance cameras, for anyone not wearing a safety helmet.

First, a complaint about the organizers: the test videos they provided were extremely low quality and carelessly shot; for some people even a human eye can't tell whether they are wearing a helmet. This "small" competition ate up pretty much my whole May Day holiday.

Brief Introduction

Here I mainly want to share the approach: a classic SSD (or Faster R-CNN if you have a high-end machine) combined with Inception v3. You may ask why not simply fine-tune SSD on the helmet data directly. That's what I thought at first too; it seemed trivial. So I read the video frame by frame, converted the frames to images, and labeled them by hand (one open question: a single image contains several people, so will training fail to converge? I suspect the impact is large).
Then I trained on these pixelated-looking images (another dig at the organizers), and the resulting model couldn't even detect a person!!! I was utterly crushed, so I came up with my own unconventional method.
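
For the record, reading a video frame by frame and dumping images for hand labeling is straightforward with OpenCV. A minimal sketch of that step (the file names and the every-10th-frame sampling are placeholders, not what the project actually used):

import os
import cv2

def extract_frames(video_path, out_dir, every_n=10):
    # Save every n-th frame as a JPEG for manual labeling
    os.makedirs(out_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    idx = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if idx % every_n == 0:
            cv2.imwrite(os.path.join(out_dir, 'frame_%06d.jpg' % idx), frame)
        idx += 1
    cap.release()

extract_frames('b.mp4', 'frames')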

Specific Steps (an unconventional method, please don't imitate)

Given that the fine-tuned model couldn't even detect people, I instead call the stock SSD first to pick out the persons, train Inception v3 separately on crops of people with and without helmets, and then at runtime feed each SSD detection into Inception v3 for classification.
The classification result replaces the label string that used to be displayed. The source code is attached below (the directory layout is the same as object_detection).

# Read the video, run detection on each frame, and display the result
import os
import cv2
import time
import numpy as np
import tensorflow as tf

from utils.app_utils import FPS
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as vis_util

CWD_PATH = os.getcwd()

MODEL_NAME = 'ssd_mobilenet_v1_coco_11_06_2017'
PATH_TO_CKPT = os.path.join(CWD_PATH, 'object_detection', MODEL_NAME, 'frozen_inference_graph.pb')
PATH_TO_LABELS = os.path.join(CWD_PATH, 'object_detection', 'data', 'mscoco_label_map.pbtxt')

# Keep only the first two COCO classes; the pipeline only needs 'person' (class 1)
NUM_CLASSES = 2
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)

categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES,
                                                            use_display_name=True)

category_index = label_map_util.create_category_index(categories)


def detect_objects(image_np, sess, detection_graph):
    # Expand the input image to shape [1, None, None, 3]
    image_np_expanded = np.expand_dims(image_np, axis=0)
    image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
    # Bounding boxes of the detections
    boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
    # Confidence score of each detection
    scores = detection_graph.get_tensor_by_name('detection_scores:0')
    classes = detection_graph.get_tensor_by_name('detection_classes:0')
    num_detections = detection_graph.get_tensor_by_name('num_detections:0')
    # Actual detection.
    # classes is a 2-D array covering all detected categories;
    # boxes has shape [1, 100, 4] (coordinates of each box),
    # with 100 scores / classes / detections to match
    (boxes, scores, classes, num_detections) = sess.run(
        [boxes, scores, classes, num_detections],
        feed_dict={image_tensor: image_np_expanded})
    # Visualization of the results of a detection.
    vis_util.visualize_boxes_and_labels_on_image_array(
        image_np,
        np.squeeze(boxes),
        np.squeeze(classes).astype(np.int32),
        np.squeeze(scores),
        category_index,
        use_normalized_coordinates=True,
        line_thickness=4,
        min_score_thresh=0.5)
    return image_np


if __name__ == '__main__':
    detection_graph = tf.Graph()
    with detection_graph.as_default():
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')
        sess = tf.Session(graph=detection_graph)
    video_capture = cv2.VideoCapture('b.mp4')
    fps = FPS().start()
    frame_width = int(video_capture.get(3))
    frame_height = int(video_capture.get(4))
    # define video output
    out = cv2.VideoWriter('outpy.mp4', cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10, (frame_width, frame_height))
    count = 0
    while video_capture.isOpened():
        ret, frame = video_capture.read()
        if not ret:  # end of video
            break
        t = time.time()
        detected_image = detect_objects(frame, sess, detection_graph)
        fps.update()
        cv2.imshow('Video', detected_image)
        # I wanted smoother playback by detecting only every other frame,
        # but the loop still blocks
        # if count % 100 == 0:
        #     print(count)
        # write to video file
        # out.write(detected_image)
        # print('[INFO] elapsed time: {:.2f}'.format(time.time() - t))
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    fps.stop()
    video_capture.release()
    sess.close()
    cv2.destroyAllWindows()
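
On the frame-skipping optimization mentioned in the commented-out lines above (running the detector only every n-th frame): a rough sketch of the idea, reusing the names from this script, could look like the following. It cuts the inference load, but the frames that do run detection still block the loop, which is why the stutter remained:

# Sketch: detect only every n-th frame and reuse the last annotated
# frame in between (video_capture, detect_objects, sess, detection_graph
# as in the script above)
DETECT_EVERY = 3
last_detected = None
count = 0
while video_capture.isOpened():
    ret, frame = video_capture.read()
    if not ret:
        break
    if count % DETECT_EVERY == 0 or last_detected is None:
        last_detected = detect_objects(frame, sess, detection_graph)
    # Skipped frames just re-show the last annotated frame, so motion looks
    # choppy; drawing cached boxes on the raw frame would be smoother
    cv2.imshow('Video', last_detected)
    count += 1
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break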
# visualization_utils
# Modify around line 160 as follows; check() is the entry point to
# Inception v3, receiving the image and the box coordinates
if use_normalized_coordinates:
    (left, right, top, bottom) = (xmin * im_width, xmax * im_width,
                                  ymin * im_height, ymax * im_height)

    name = check(image.copy(), left, right, top, bottom)


# Around line 188:
# name is a global variable holding the string returned by Inception v3
draw.text(
    (left + margin, text_bottom - text_height - margin),
    name,
    fill='black',
    font=font)

# check module: the entry point to Inception v3
import tensorflow as tf
import numpy as np
from pylab import array

def check(image, left, right, top, bottom):
    got = array(image)
    crop_img = got[int(top):int(bottom), int(left):int(right), 0:3]
    # Load the model I trained earlier
    with tf.gfile.FastGFile('output_graph.pb', 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        tf.import_graph_def(graph_def, name='')

    with tf.Session() as sess:
        softmax_tensor = sess.graph.get_tensor_by_name('final_result:0')
        # Convert the incoming crop to float32
        first = tf.image.convert_image_dtype(crop_img, dtype=tf.float32)
        # JPEG-encode it; eval() pulls the tensor's value out of
        # TensorFlow as a plain array
        encode = tf.image.encode_jpeg(first.eval())
        # Feed the encoded bytes in through the DecodeJpeg input
        predictions = sess.run(softmax_tensor, {'DecodeJpeg/contents:0': encode.eval()})  # the image is JPEG data
        predictions = np.squeeze(predictions)  # flatten the result to 1-D
        top_k = predictions.argsort()[::-1]
        if top_k[0] == 1:
            human_string = "unsafe"
        else:
            human_string = "safe"
        return human_string  # returned to the box-drawing code

Summary

This seemingly flawless pipeline crashed in practice because of my laptop's weak specs (how I want a desktop with a GPU!!). I switched to a slightly better laptop and it still crashed: running two TensorFlow sessions consumes far more memory than I expected.
It seemed this approach could only live in dreams, and I hoped to come up with some lower-level optimization later (I had already done plenty of I/O optimization, but the core problem is that everything runs sequentially, so the second-stage classification inevitably stalls the pipeline).
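
For what it's worth, a common way to break up such a sequential loop is a producer-consumer split: the main loop keeps capturing and displaying frames while a worker thread runs the detection. The sketch below only illustrates that idea; it assumes detect_objects, sess and detection_graph from the script above and is not code from this project:

import queue
import threading

import cv2

frame_q = queue.Queue(maxsize=5)    # bounded, so capture never piles up frames
result_q = queue.Queue()

def detection_worker():
    # Runs SSD (+ Inception v3) off the capture/display thread
    while True:
        frame = frame_q.get()
        if frame is None:           # sentinel: shut down
            break
        result_q.put(detect_objects(frame, sess, detection_graph))

threading.Thread(target=detection_worker, daemon=True).start()

video_capture = cv2.VideoCapture('b.mp4')
while video_capture.isOpened():
    ret, frame = video_capture.read()
    if not ret:
        break
    if not frame_q.full():          # drop frames instead of blocking capture
        frame_q.put(frame)
    if not result_q.empty():        # show results as they become available
        cv2.imshow('Video', result_q.get())
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
frame_q.put(None)
video_capture.release()
cv2.destroyAllWindows()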

Update!!!

I finally found the root cause!! The graph was being reloaded for every single incoming frame, which is what blew up the memory! After the fix it actually runs, though it is still spec-hungry; a higher-spec machine should run it more smoothly.
The changes are below. The only other modification is passing the pre-initialized graph into every visualization_utils function along the call chain. The whole project will go up on GitHub.

# Mainly changes to the main block of vediondetection.py

if __name__ == '__main__':
    # tf.Graph() creates a new graph
    detection_graph = tf.Graph()
    inceptionsess = tf.Graph()  # despite the name, this holds the Inception v3 graph
    with inceptionsess.as_default():
        od_graph_def = tf.GraphDef()
        with tf.gfile.FastGFile('output_graph.pb', 'rb') as f:
            serialized_graph = f.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')

    with detection_graph.as_default():
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')
        sess = tf.Session(graph=detection_graph)
    video_capture = cv2.VideoCapture('b.mp4')
    fps = FPS().start()
    frame_width = int(video_capture.get(3))
    frame_height = int(video_capture.get(4))
    # define video output
    out = cv2.VideoWriter('outpy.mp4', cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10, (frame_width, frame_height))
    count = 0
    while video_capture.isOpened():
        ret, frame = video_capture.read()
        if not ret:  # end of video
            break
        t = time.time()
        detected_image = detect_objects(frame, sess, detection_graph, inceptionsess)
        fps.update()
        out.write(detected_image)
        cv2.imshow('Video', detected_image)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    fps.stop()
    video_capture.release()
    sess.close()
    cv2.destroyAllWindows()
# Changes to the check module's method

def check(image, left, right, top, bottom, inceptionsess):
    got = array(image)
    crop_img = got[int(top):int(bottom), int(left):int(right), 0:3]
    # The model is no longer reloaded here on every call:
    # with tf.gfile.FastGFile('output_graph.pb', 'rb') as f:
    #     graph_def = tf.GraphDef()
    #     graph_def.ParseFromString(f.read())
    #     tf.import_graph_def(graph_def, name='')
    # Build the encode op inside the Inception graph so encode.eval() runs
    # against the same graph as the session (note this still adds one op per call)
    with inceptionsess.as_default():
        with tf.Session(graph=inceptionsess) as sess:
            softmax_tensor = sess.graph.get_tensor_by_name('final_result:0')
            # JPEG-encode the crop;
            # eval() returns the value of the tensor represented by this handle
            encode = tf.image.encode_jpeg(crop_img)
            predictions = sess.run(softmax_tensor, {'DecodeJpeg/contents:0': encode.eval()})  # the image is JPEG data
            predictions = np.squeeze(predictions)  # flatten the result to 1-D
            top_k = predictions.argsort()[::-1]
            if top_k[0] == 1:
                human_string = "unsafe"
            else:
                human_string = "safe"
            return human_string
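
Even with the graph loaded only once, this version still opens a fresh tf.Session (and builds a new encode_jpeg op) for every detected box, which is expensive. A possible further tweak, sketched here as an assumption rather than something the project does: add a hypothetical one-time init_checker() call at startup, keep one long-lived session, and do the JPEG encoding with OpenCV instead of TensorFlow ops:

import cv2
import numpy as np
import tensorflow as tf

_incep_sess = None       # long-lived session, created once at startup
_softmax_tensor = None

def init_checker(inception_graph):
    # Hypothetical one-time setup, called after output_graph.pb has been
    # imported into inception_graph
    global _incep_sess, _softmax_tensor
    _incep_sess = tf.Session(graph=inception_graph)
    _softmax_tensor = inception_graph.get_tensor_by_name('final_result:0')

def check(image, left, right, top, bottom):
    crop = np.asarray(image)[int(top):int(bottom), int(left):int(right), 0:3]
    # Encode with OpenCV instead of adding a TF op per call; cv2 expects BGR,
    # so reverse the channels here on the assumption that the crop is RGB
    bgr = np.ascontiguousarray(crop[:, :, ::-1])
    jpeg_bytes = cv2.imencode('.jpg', bgr)[1].tobytes()
    predictions = np.squeeze(
        _incep_sess.run(_softmax_tensor, {'DecodeJpeg/contents:0': jpeg_bytes}))
    return "unsafe" if predictions.argmax() == 1 else "safe"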