Split subtitles handling into two handlers.

One for generation and one for import.
This commit is contained in:
Erik Thuning 2023-11-30 16:35:26 +01:00
parent 2da06f84a7
commit 10c126a37c
4 changed files with 97 additions and 44 deletions

@ -57,7 +57,7 @@ token = A70keN
# size and one other job of the next smaller size. Otherwise no jobs will run.
#
# See also TranscodeHandler.jobsize and SubtitlesWhisperHandler.jobsize
capacity = 100
capacity = 20
[Daisy]
@ -72,7 +72,7 @@ url = ldaps://ldap.example.com
base_dn = dc=example,dc=com
[SubtitlesHandler]
[SubtitlesWhisperHandler]
# The whisper model to use for subtitle generation
whispermodel = large-v2
@ -83,7 +83,7 @@ modeldir = /some/path
# The amount of resources a single whisper job will consume
# in the worker pool. This should be set so that the server can handle the
# pool getting completely filled with jobs of this type.
jobsize = 25
jobsize = 5
[ThumbnailHandler]
@ -105,7 +105,7 @@ encoder = software
# The amount of resources a single transcode job will consume
# in the worker pool. This should be set so that the server can handle the
# pool getting completely filled with jobs of this type.
jobsize = 5
jobsize = 2
[MediasiteProcessor]

@ -5,7 +5,8 @@ from .handler import Handler
from .metadata import MetadataHandler
from .poster import PosterHandler
from .slides import SlidesHandler
from .subtitles import SubtitlesHandler
from .subtitles_whisper import SubtitlesWhisperHandler
from .subtitles_import import SubtitlesImportHandler
from .thumbnail import ThumbnailHandler
from .transcode import TranscodeHandler
from ..ldap import Ldap
@ -15,7 +16,8 @@ allHandlers = [AudioHandler,
MetadataHandler,
PosterHandler,
SlidesHandler,
SubtitlesHandler,
SubtitlesImportHandler,
SubtitlesWhisperHandler,
ThumbnailHandler,
TranscodeHandler,
]

@ -0,0 +1,70 @@
import logging
from pathlib import Path
from .handler import Handler
from ..exceptions import ValidationException
@Handler.register
class SubtitlesImportHandler(Handler):
    """
    This class saves uploaded subtitles to a package.
    """

    @classmethod
    def wants(cls, jobspec, existing_package):
        """
        Return True if this handler wants to process this jobspec.
        Raises an exception if the job is wanted but doesn't pass validation.

        A job is wanted if the job specification contains a 'subtitles' key.
        """
        if 'subtitles' in jobspec:
            return cls._validate(jobspec, existing_package)
        return False

    @classmethod
    def _validate(cls, jobspec, existing_package):
        """
        Return True if the job is valid for this handler.

        Raises a ValidationException if any requirement is not met.

        Validity requirements are:
        - Keys in 'subtitles' and 'generate_subtitles' must be
          mutually unique.
        - If any value in the 'subtitles' object is not None, the job must
          contain an 'upload_dir' key.
        - All 'subtitles' values that are not None must be relative paths
          without '..' components, and must be existing files
          under 'upload_dir'.
        """
        super()._validate(jobspec, existing_package)

        subtitles = jobspec.get('subtitles', {})

        # Check for duplicate track names
        generate_names = jobspec.get('generate_subtitles', {}).keys()
        common_names = generate_names & subtitles.keys()
        if common_names:
            # Sorted so that the error message is deterministic
            names_string = ', '.join(sorted(common_names))
            raise ValidationException(
                f"Duplicate subtitle track name(s): {names_string}")

        # Validate storage tasks
        for name, subsfile in subtitles.items():
            if not subsfile:
                continue
            if 'upload_dir' not in jobspec:
                raise ValidationException("upload_dir missing")

            # An absolute path would replace upload_dir entirely when joined,
            # and '..' components can climb out of it. Either would let a job
            # import an arbitrary file, so both are rejected up front.
            relpath = Path(subsfile)
            if relpath.is_absolute() or '..' in relpath.parts:
                raise ValidationException(
                    f"Error for subtitle track {name}: "
                    f"{subsfile} is not a path under upload_dir")

            subspath = Path(jobspec['upload_dir']) / relpath
            if not subspath.is_file():
                raise ValidationException(
                    f"Error for subtitle track {name}: "
                    f"{subspath} is not a valid file")

        return True

    def _handle(self, jobspec, existing_package, tempdir):
        """
        Return a function to apply changes to the stored package.

        The returned function calls package.set_subtitle_track once for each
        entry in 'subtitles'. Entries with a file name are passed as a path
        under 'upload_dir'; entries without one are passed as None
        (presumably this removes the track - confirm against
        set_subtitle_track).

        existing_package and tempdir are not used by this handler.
        """
        def apply_func(package):
            for name, subsfile in jobspec.get('subtitles', {}).items():
                subspath = None
                if subsfile:
                    subspath = Path(jobspec['upload_dir']) / subsfile
                package.set_subtitle_track(name, subspath)

        return apply_func

@ -16,7 +16,8 @@ def _do_whisper_transcribe(inpath,
device,
modelname,
modeldir,
language=None):
loglevel,
language=None,):
"""
Transcribe the given file at 'inpath' to a VTT file at 'outpath'
using the Whisper engine.
@ -25,7 +26,8 @@ def _do_whisper_transcribe(inpath,
"""
logger = logging.getLogger(
'play-daemon.SubtitlesHandler._do_whisper_transcribe')
'play-daemon.SubtitlesWhisperHandler._do_transcribe')
logger.setLevel(loglevel)
logger.info(f'Starting whisper transcription job for {inpath}.')
try:
whisperModel = whisper.load_model(
@ -57,14 +59,13 @@ def _do_whisper_transcribe(inpath,
elapsed = time.strftime('%H:%M:%S', time.gmtime(end - start))
logger.info('Finished whisper transcription job '
f'for {inpath} in {elapsed}.')
[handler.flush() for handler in logger.handlers]
return (outpath, out_language)
@Handler.register
class SubtitlesHandler(Handler):
class SubtitlesWhisperHandler(Handler):
"""
This class handles package subtitles.
This class handles subtitle generation with Whisper.
"""
def __init__(self,
handlerqueue,
@ -84,7 +85,7 @@ class SubtitlesHandler(Handler):
self.whispermodel = config['whispermodel']
self.modeldir = config['modeldir']
self.device = device
self.logger.debug(f'Created SubtitlesHandler on {device}')
self.logger.debug(f'Created SubtitlesWhisperHandler on {device}')
@classmethod
def instantiate(cls,
@ -95,7 +96,7 @@ class SubtitlesHandler(Handler):
tempdir,
config):
"""
Returns a list SubtitlesHandlers.
Returns a list of SubtitlesWhisperHandlers.
Instantiation behaviour is governed by two configuration values:
device and count. Both are optional.
@ -153,10 +154,10 @@ class SubtitlesHandler(Handler):
Return True if this handler wants to process this jobspec.
Raises an exception if the job is wanted but doesn't pass validation.
A job is wanted if the job specification contains a 'subtitles' or a
'generate_subtitles' key.
A job is wanted if the job specification contains
a 'generate_subtitles' key.
"""
if 'subtitles' in jobspec or 'generate_subtitles' in jobspec:
if 'generate_subtitles' in jobspec:
return cls._validate(jobspec, existing_package)
return False
@ -168,11 +169,6 @@ class SubtitlesHandler(Handler):
Validity requirements are:
- Keys in 'subtitles' and 'generate_subtitles' must be
mutually unique.
- If any value in the 'subtitles' object is not None, the job must
contain an 'upload_dir' key which must point to an
existing directory.
- All 'subtitles' values that are not None must be existing files
under 'upload_dir'.
- All 'source' values in subtitle generation specifications must be a
valid source name, either one that already exists or one provided
under 'sources' in this job.
@ -203,27 +199,15 @@ class SubtitlesHandler(Handler):
raise ValidationException(f"Subtitle track '{name}' refers "
"to a missing source: "
f"{expected_source}")
# Validate storage tasks
for name, subsfile in jobspec.get('subtitles', {}).items():
if not subsfile:
continue
if 'upload_dir' not in jobspec:
raise ValidationException("upload_dir missing")
subspath = Path(jobspec['upload_dir']) / subsfile
if not subspath.is_file():
raise ValidationException(
f"Error for subtitle track {name}: "
f"{subspath} is not a valid file")
return True
def _handle(self, jobspec, existing_package, tempdir):
"""
Return a function to apply changes to the stored package.
Any subtitle generation tasks are run before apply_func is returned.
The returned function moves subtitle files into the package's basedir
and updates the package metadata.
All subtitle generation tasks are run before apply_func is returned.
The returned function moves generated subtitle files into the
package's basedir and updates the package metadata.
Replaced subtitle tracks are deleted.
"""
@ -233,6 +217,7 @@ class SubtitlesHandler(Handler):
for trackname, item in jobspec.get('generate_subtitles', {}).items():
sourcename = item['source']
generated_name = sourcename.replace('_', '__').replace(' ', '_')
language = item.get('language', None)
sourcepath = None
source_from_job = jobspec.get('sources', {}).get(sourcename, {})
@ -250,7 +235,8 @@ class SubtitlesHandler(Handler):
sourcefile = existing_source['video'][resolutions_sorted[0]]
sourcepath = basedir / sourcefile
outpath = tempdir / f"{sourcename}.vtt"
generated_name = sourcename.replace('_', '__').replace(' ', '_')
outpath = tempdir / f"{generated_name}.vtt"
transcribe = self.asyncjob(self.jobsize,
_do_whisper_transcribe,
@ -258,7 +244,8 @@ class SubtitlesHandler(Handler):
outpath,
self.device,
self.whispermodel,
self.modeldir),
self.modeldir,
loglevel),
{'language': language})
transcribes.append(transcribe)
resultfiles[trackname] = outpath
@ -270,12 +257,6 @@ class SubtitlesHandler(Handler):
self.logger.info("Done, making apply_func")
def apply_func(package):
for name, subsfile in jobspec.get('subtitles', {}).items():
subspath = None
if subsfile:
subspath = Path(jobspec['upload_dir']) / subsfile
package.set_subtitle_track(name, subspath)
for name, subspath in resultfiles.items():
package.set_subtitle_track(name, subspath)