import math
import torch
import torch.nn as nn
import cv2 as cv
import numpy as np
import torchvision

from pathlib import Path
from PIL import Image
from torch import Tensor
from torch.nn import Module
from numpy import ndarray, linalg
from torchvision import transforms
from ortools.algorithms.pywrapknapsack_solver import KnapsackSolver
from config import DSNetConfig
from typing import Iterable
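

# GoogLeNet pool5 feature extractor: the pretrained network minus its final
# dropout and fc layers (children()[:-2]) maps each RGB frame to a 1024-d
# embedding, L2-normalised in run() before use.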
class FeatureExtractor(object):

    def __init__(self):
        super(FeatureExtractor, self).__init__()
        self.preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])
        self.model = torchvision.models.googlenet(pretrained=True)
        self.model = torch.nn.Sequential(*list(self.model.children())[:-2])
        self.model = self.model.to(DSNetConfig.device).eval()

    def run(self, img: ndarray) -> ndarray:
        img: Image.Image = Image.fromarray(img)
        tensor: Tensor = self.preprocess(img)
        batch: Tensor = tensor.unsqueeze(0)
        with torch.no_grad():
            feat = self.model(batch.to(DSNetConfig.device))
            feat = feat.squeeze().cpu().numpy()

        assert feat.shape == (1024,), f'Invalid feature shape {feat.shape}: expected 1024'
        feat /= linalg.norm(feat) + 1e-10
        return feat
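

# Decodes a video, extracts a feature per `sample_rate` frames, and segments
# the feature sequence into shots with kernel temporal segmentation (KTS).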
class VideoPreprocessor(object):

    def __init__(self) -> None:
        super(VideoPreprocessor, self).__init__()
        self.model = FeatureExtractor()
        self.sample_rate = DSNetConfig.sample_rate

    def get_features(self, video_path: str) -> tuple[int, ndarray]:
        video_path = Path(video_path)
        video_capture = cv.VideoCapture(str(video_path))
        assert video_capture.isOpened(), f'Cannot open video: {video_path}'

        features: list[ndarray] = []
        n_frames: int = 0
        while True:
            ret, frame = video_capture.read()
            if not ret:
                break
            if n_frames % self.sample_rate == 0:
                frame = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
                feat = self.model.run(frame)
                features.append(feat)
            n_frames += 1
        video_capture.release()
        features = np.array(features)
        return n_frames, features
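
    # J[i, j] is the within-segment scatter of frames i..j under kernel K:
    #     J[i, j] = sum_{t=i..j} K[t, t] - (1 / (j - i + 1)) * sum_{s,t=i..j} K[s, t]
    # computed for all (i, j) at once via cumulative sums.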
    @staticmethod
    def calculate_scatters(K: ndarray) -> ndarray:
        n = K.shape[0]
        K1 = np.cumsum([0] + list(np.diag(K)))
        K2 = np.zeros((n + 1, n + 1))
        K2[1:, 1:] = np.cumsum(np.cumsum(K, 0), 1)

        diagK2 = np.diag(K2)

        i = np.arange(n).reshape((-1, 1))
        j = np.arange(n).reshape((1, -1))
        scatters = (
            K1[1:].reshape((1, -1)) - K1[:-1].reshape((-1, 1)) -
            (diagK2[1:].reshape((1, -1)) + diagK2[:-1].reshape((-1, 1)) -
             K2[1:, :-1].T - K2[:-1, 1:]) /
            ((j - i + 1).astype(np.float32) + (j == i - 1).astype(np.float32))
        )
        scatters[j < i] = 0
        return scatters
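
    # Dynamic program over segment boundaries: I[k, l] is the minimal total
    # scatter of the first l frames split by k change points,
    #     I[k, l] = min_t ( I[k - 1, t] + J[t, l - 1] ),
    # with segment lengths clamped to [lmin, lmax]; p[k, l] stores the argmin
    # so the change-point positions can be backtracked.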
    @staticmethod
    def change_point_detect_nonlin(K: ndarray, ncp: int, lmin: int = 1,
                                   lmax: int = 100000,
                                   backtrack: bool = True) -> tuple[ndarray, ndarray]:
        m = int(ncp)
        n, n1 = K.shape
        assert n == n1, 'Square kernel matrix expected.'
        assert (m + 1) * lmin <= n <= (m + 1) * lmax
        assert 1 <= lmin <= lmax

        J = VideoPreprocessor.calculate_scatters(K)
        I = 1e101 * np.ones((m + 1, n + 1))
        I[0, lmin:lmax] = J[0, lmin - 1:lmax - 1]
        p = np.zeros((m + 1, n + 1), dtype=int) if backtrack else np.zeros((1, 1), dtype=int)

        for k in range(1, m + 1):  # k: change points placed so far, splitting the video into (k + 1) segments
            for l in range((k + 1) * lmin, n + 1):  # l: end position of the current segment, i.e. the earliest start of the next one
                tmin = max(k * lmin, l - lmax)  # tmin: the k change points consume at least k * lmin frames, giving the earliest start of the current segment
                tmax = l - lmin + 1  # tmax: latest start that still leaves the current segment at least lmin frames
                c = J[tmin:tmax, l - 1].reshape(-1) + \
                    I[k - 1, tmin:tmax].reshape(-1)
                I[k, l] = np.min(c)
                if backtrack:
                    p[k, l] = np.argmin(c) + tmin

        cps = np.zeros(m, dtype=int)
        if backtrack:
            cur = n
            for k in range(m, 0, -1):
                cps[k - 1] = p[k, cur]
                cur = cps[k - 1]
        scores = I[:, n].copy()
        scores[scores > 1e99] = np.inf
        return cps, scores
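
    # Automatic model selection: solve the DP for the maximal number of change
    # points once, normalise the costs by sequence length, add a penalty that
    # grows with the number of segments, and re-solve at the cheapest count.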
    @staticmethod
    def change_point_detect_auto(K: ndarray) -> tuple[ndarray, ndarray]:
        m, N = len(K) - 1, len(K)
        _, scores = VideoPreprocessor.change_point_detect_nonlin(K, m, backtrack=False)

        penalties = np.zeros(m + 1)
        ncp = np.arange(1, m + 1)
        penalties[1:] = (ncp / (2.0 * N)) * (np.log(float(N) / ncp) + 1)

        costs = scores / float(N) + penalties
        m_best = np.argmin(costs)
        return VideoPreprocessor.change_point_detect_nonlin(K, m_best)
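
    # KTS driver: build a linear (dot-product) kernel over the sampled
    # features, detect change points, rescale them to original frame indices,
    # and emit inclusive [begin, end] frame pairs per shot.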
    def kernel_temporal_segment(self, n_frames: int, features: ndarray) -> tuple[ndarray, ndarray, ndarray]:
        seq_len = len(features)
        picks = np.arange(0, seq_len) * self.sample_rate

        kernel = np.matmul(features, features.T)
        change_points, _ = VideoPreprocessor.change_point_detect_auto(kernel)
        change_points *= self.sample_rate
        change_points = np.hstack((0, change_points, n_frames))
        begin_frames = change_points[:-1]
        end_frames = change_points[1:]
        change_points = np.vstack((begin_frames, end_frames - 1)).T

        n_frame_per_seg = end_frames - begin_frames
        return change_points, n_frame_per_seg, picks

    def run(self, video_path: str) -> tuple[int, ndarray, ndarray, ndarray, ndarray]:
        n_frames, features = self.get_features(video_path)
        cps, nfps, picks = self.kernel_temporal_segment(n_frames, features)
        return n_frames, features, cps, nfps, picks
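

# Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V, with dropout
# applied to the attention weights.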
class ScaledDotProductAttention(Module):

    def __init__(self, d_k: float):
        super(ScaledDotProductAttention, self).__init__()
        self.dropout = nn.Dropout(0.5)
        self.sqrt_d_k = math.sqrt(d_k)

    def forward(self, Q: Tensor, K: Tensor, V: Tensor) -> tuple[Tensor, Tensor]:
        attn = torch.bmm(Q, K.transpose(2, 1))
        attn = attn / self.sqrt_d_k
        attn = torch.softmax(attn, dim=-1)
        attn = self.dropout(attn)
        y = torch.bmm(attn, V)
        return y, attn
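

# Multi-head attention for a batch of one: the Q/K/V projections are split
# into num_head heads of d_k = num_feature // num_head channels, attended
# independently, re-concatenated, and mixed by a final linear layer.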
class MultiHeadAttention(nn.Module):

    def __init__(self, num_head: int, num_feature: int) -> None:
        super(MultiHeadAttention, self).__init__()
        self.num_head = num_head
        self.Q = nn.Linear(num_feature, num_feature, bias=False)
        self.K = nn.Linear(num_feature, num_feature, bias=False)
        self.V = nn.Linear(num_feature, num_feature, bias=False)
        self.d_k = num_feature // num_head
        self.attention = ScaledDotProductAttention(self.d_k)
        self.fc = nn.Sequential(
            nn.Linear(num_feature, num_feature, bias=False),
            nn.Dropout(0.5)
        )

    def forward(self, x: Tensor) -> tuple[Tensor, Tensor]:
        _, seq_len, num_feature = x.shape  # [1, seq_len, 1024]
        K: Tensor = self.K(x)  # [1, seq_len, 1024]
        Q: Tensor = self.Q(x)  # [1, seq_len, 1024]
        V: Tensor = self.V(x)  # [1, seq_len, 1024]

        K = K.view(1, seq_len, self.num_head, self.d_k).permute(
            2, 0, 1, 3).contiguous().view(self.num_head, seq_len, self.d_k)
        Q = Q.view(1, seq_len, self.num_head, self.d_k).permute(
            2, 0, 1, 3).contiguous().view(self.num_head, seq_len, self.d_k)
        V = V.view(1, seq_len, self.num_head, self.d_k).permute(
            2, 0, 1, 3).contiguous().view(self.num_head, seq_len, self.d_k)

        y, attn = self.attention(Q, K, V)  # [num_head, seq_len, d_k]
        y = y.view(1, self.num_head, seq_len, self.d_k).permute(
            0, 2, 1, 3).contiguous().view(1, seq_len, num_feature)

        y = self.fc(y)
        return y, attn
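

# Thin wrapper that keeps only the attended features and discards the
# attention map, so the module can serve as a drop-in feature extractor.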
class AttentionExtractor(MultiHeadAttention):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, *inputs):
        out, _ = super().forward(*inputs)
        return out
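

# Anchor-free DSNet head: attention features plus a residual connection feed
# three parallel heads predicting per-frame importance (fc_cls), left/right
# segment offsets (fc_loc), and a centerness score (fc_ctr).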
class DSNetAF(Module):

    def __init__(self, num_feature: int, num_hidden: int, num_head: int) -> None:
        super(DSNetAF, self).__init__()
        self.base_model = AttentionExtractor(num_head, num_feature)
        self.layer_norm = nn.LayerNorm(num_feature)
        self.fc1 = nn.Sequential(
            nn.Linear(num_feature, num_hidden),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.LayerNorm(num_hidden),
        )
        self.fc_cls = nn.Linear(num_hidden, 1)
        self.fc_loc = nn.Linear(num_hidden, 2)
        self.fc_ctr = nn.Linear(num_hidden, 1)

    @staticmethod
    def offset2bbox(offsets: np.ndarray) -> np.ndarray:
        offset_left, offset_right = offsets[:, 0], offsets[:, 1]
        seq_len, _ = offsets.shape
        indices = np.arange(seq_len)
        bbox_left = indices - offset_left
        bbox_right = indices + offset_right + 1
        bboxes = np.vstack((bbox_left, bbox_right)).T
        return bboxes

    def forward(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
        _, seq_len, _ = x.shape
        out = self.base_model(x)
        out = out + x
        out = self.layer_norm(out)
        out = self.fc1(out)
        pred_cls = self.fc_cls(out).sigmoid().view(seq_len)
        pred_loc = self.fc_loc(out).exp().view(seq_len, 2)
        pred_ctr = self.fc_ctr(out).sigmoid().view(seq_len)
        return pred_cls, pred_loc, pred_ctr

    def predict(self, seq: Tensor) -> tuple[ndarray, ndarray]:
        pred_cls, pred_loc, pred_ctr = self(seq)
        pred_cls *= pred_ctr
        pred_cls /= pred_cls.max() + 1e-8
        pred_cls = pred_cls.cpu().numpy()
        pred_loc = pred_loc.cpu().numpy()
        pred_bboxes = DSNetAF.offset2bbox(pred_loc)
        return pred_cls, pred_bboxes
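

# 1-D IoU between [left, right) temporal segments, computed elementwise with
# broadcasting; degenerate unions are clamped to a small epsilon.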
def iou_lr(anchor_bbox: np.ndarray, target_bbox: np.ndarray) -> np.ndarray:
    anchor_left, anchor_right = anchor_bbox[:, 0], anchor_bbox[:, 1]
    target_left, target_right = target_bbox[:, 0], target_bbox[:, 1]

    inter_left = np.maximum(anchor_left, target_left)
    inter_right = np.minimum(anchor_right, target_right)
    union_left = np.minimum(anchor_left, target_left)
    union_right = np.maximum(anchor_right, target_right)

    intersect = inter_right - inter_left
    intersect[intersect < 0] = 0
    union = union_right - union_left
    union[union <= 0] = 1e-6

    iou = intersect / union
    return iou
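

# Greedy non-maximum suppression over temporal segments: keep the
# highest-scoring segment, drop every remaining segment whose IoU with it
# reaches thresh, and repeat on what is left.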
def nms(scores: np.ndarray, bboxes: np.ndarray, thresh: float) -> tuple[np.ndarray, np.ndarray]:
    valid_idx = bboxes[:, 0] < bboxes[:, 1]
    scores = scores[valid_idx]
    bboxes = bboxes[valid_idx]

    arg_desc = scores.argsort()[::-1]

    scores_remain = scores[arg_desc]
    bboxes_remain = bboxes[arg_desc]

    keep_bboxes = []
    keep_scores = []

    while bboxes_remain.size > 0:
        bbox = bboxes_remain[0]
        score = scores_remain[0]
        keep_bboxes.append(bbox)
        keep_scores.append(score)

        iou = iou_lr(bboxes_remain, np.expand_dims(bbox, axis=0))

        keep_indices = (iou < thresh)
        bboxes_remain = bboxes_remain[keep_indices]
        scores_remain = scores_remain[keep_indices]

    keep_bboxes = np.asarray(keep_bboxes, dtype=bboxes.dtype)
    keep_scores = np.asarray(keep_scores, dtype=scores.dtype)
    return keep_scores, keep_bboxes
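

# 0/1 knapsack via OR-Tools: pick shot indices maximising total score under a
# total-length budget. A minimal usage sketch (the numbers are made up for
# illustration):
#
#     >>> knapsack(values=[60, 100, 120], weights=[10, 20, 30], capacity=50)
#     [1, 2]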
def knapsack(values: Iterable[int],
             weights: Iterable[int],
             capacity: int
             ) -> list[int]:
    knapsack_solver = KnapsackSolver(
        KnapsackSolver.KNAPSACK_DYNAMIC_PROGRAMMING_SOLVER, 'test'
    )

    values = list(values)
    weights = list(weights)
    capacity = int(capacity)

    knapsack_solver.Init(values, [weights], [capacity])
    knapsack_solver.Solve()
    packed_items = [x for x in range(0, len(weights))
                    if knapsack_solver.BestSolutionContains(x)]

    return packed_items
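

# Key-shot summary: upsample the per-sample scores back to frame rate,
# average them per shot, then select shots by knapsack so the summary stays
# within `proportion` (15% by default) of the video length.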
def get_keyshot_summ(pred: np.ndarray,
                     cps: np.ndarray,
                     n_frames: int,
                     nfps: np.ndarray,
                     picks: np.ndarray,
                     proportion: float = 0.15
                     ) -> np.ndarray:
    assert pred.shape == picks.shape
    picks = np.asarray(picks, dtype=np.int32)

    # Get original frame scores from the downsampled sequence
    frame_scores = np.zeros(n_frames, dtype=np.float32)
    for i in range(len(picks)):
        pos_lo = picks[i]
        pos_hi = picks[i + 1] if i + 1 < len(picks) else n_frames
        frame_scores[pos_lo:pos_hi] = pred[i]

    # Assign each video shot the average score of its frames
    seg_scores = np.zeros(len(cps), dtype=np.int32)
    for seg_idx, (first, last) in enumerate(cps):
        scores = frame_scores[first:last + 1]
        seg_scores[seg_idx] = int(1000 * scores.mean())

    # Apply the knapsack algorithm to find the best shots
    limits = int(n_frames * proportion)
    packed = knapsack(seg_scores, nfps, limits)

    # Get the key-shot based summary
    summary = np.zeros(n_frames, dtype=np.bool_)
    for seg_idx in packed:
        first, last = cps[seg_idx]
        summary[first:last + 1] = True
    return summary
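

# Fuse overlapping segment proposals into a per-sample score curve (keeping
# the maximum where proposals overlap), then reduce it to a key-shot summary.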
def bbox2summary(seq_len: int,
                 pred_cls: np.ndarray,
                 pred_bboxes: np.ndarray,
                 change_points: np.ndarray,
                 n_frames: int,
                 nfps: np.ndarray,
                 picks: np.ndarray
                 ) -> np.ndarray:
    score = np.zeros(seq_len, dtype=np.float32)
    for bbox_idx in range(len(pred_bboxes)):
        lo, hi = pred_bboxes[bbox_idx, 0], pred_bboxes[bbox_idx, 1]
        score[lo:hi] = np.maximum(score[lo:hi], [pred_cls[bbox_idx]])

    pred_summ = get_keyshot_summ(score, change_points, n_frames, nfps, picks)
    return pred_summ


video_preprocessor = VideoPreprocessor()
dsnet_af = DSNetAF(1024, 128, 8).to(DSNetConfig.device)
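

# End-to-end inference: extract features, predict segment proposals, apply
# NMS, reduce the surviving proposals to a frame-level summary, then re-read
# the video and write the selected frames to save_path.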
def extract(video_path: str, save_path: str) -> None:
    n_frames, seq, cps, nfps, picks = video_preprocessor.run(video_path)
    with torch.no_grad():
        seq_torch = torch.from_numpy(seq).unsqueeze(0).to(DSNetConfig.device)
        pred_cls, pred_bboxes = dsnet_af.predict(seq_torch)
    pred_bboxes = np.clip(pred_bboxes, 0, len(seq)).round().astype(np.int32)
    pred_cls, pred_bboxes = nms(pred_cls, pred_bboxes, DSNetConfig.nms_thresh)
    pred_summ = bbox2summary(len(seq), pred_cls, pred_bboxes, cps, n_frames, nfps, picks)

    cap = cv.VideoCapture(video_path)
    width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv.CAP_PROP_FPS)

    # Create the summary video writer
    fourcc = cv.VideoWriter.fourcc(*'mp4v')
    out = cv.VideoWriter(save_path, fourcc, fps, (width, height))

    frame_idx = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if pred_summ[frame_idx]:
            out.write(frame)
        frame_idx += 1
    out.release()
    cap.release()
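

# Minimal usage sketch: the input and output paths below are placeholders,
# and the model weights are assumed to be loaded elsewhere; substitute real
# values before running.
if __name__ == '__main__':
    extract('video.mp4', 'summary.mp4')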