# play-daemon-threaded/pipeline/preprocessors/arec.py

import json
import os
import re

from datetime import datetime
from pathlib import Path

from .preprocessor import Preprocessor


@Preprocessor.register
class ArecProcessor(Preprocessor):
    '''
    Preprocessing for arec jobs.

    Requires that at least CH1 be present in the upload. This is not checked
    by the validation function - it must be configured properly when the
    recorder is installed.

    CH1 is used for audio playback and subtitles generation, so should be
    the camera input.
    '''
    def __init__(self, *args, **kwargs):
        '''
        Pass all arguments on to the base class and compile the regex used
        to recognize the per-channel video files of a capture.
        '''
        super().__init__(*args, **kwargs)
        # This regex matches the name format for the individual channel files
        # in a capture, which is this format:
        #  some-prefix_1970_12_31_23_45_00_CH?_Name.mp4
        # The above ? represents a number between 1 and 4.
        # Capture group 1 is there to make the upload name prefix optional
        # Capture group 2 is only there to shorten the regex.
        # Capture group 3 captures the channel number,
        # capture group 4 captures the channel name.
        #
        # The pattern is a raw string so that \d reaches the regex engine
        # untouched, and the dot before the extension is escaped so that
        # only a literal '.mp4' suffix is accepted.
        self.name_regex = re.compile(
            r'^([^_]+_)?\d{4}(_\d{2}){5}_CH([1-4])_([^.]+)\.mp4$')

    def validate(self, queueitem):
        '''
        Check that a queue item describes a usable arec upload.

        The item must carry an 'upload_dir' key pointing at an existing
        path, and the information.json file in that directory must provide
        the keys 'Title', 'Device_description', 'Start_time' and 'End_time',
        the latter two parseable by datetime.fromisoformat.

        Returns True if all checks pass.

        Raises KeyError if 'upload_dir' or any required json key is missing,
        and ValueError if the upload dir doesn't exist or a time value is
        malformed. Errors from opening or parsing information.json are not
        caught here and will propagate to the caller.
        '''
        if 'upload_dir' not in queueitem:
            raise KeyError('upload_dir missing from job specification.')
        upload_dir = Path(queueitem['upload_dir'])
        if not upload_dir.exists():
            raise ValueError('Specified upload_dir does not exist.')

        arec_data = self._parse_arec_json(upload_dir)
        required_strings = ['Title',
                            'Device_description']
        required_datetimes = ['Start_time',
                              'End_time']
        for key in required_strings + required_datetimes:
            if key not in arec_data:
                raise KeyError(f'{key} missing from arec json file.')

        for key in required_datetimes:
            # Will throw an exception if the format is invalid
            datetime.fromisoformat(arec_data[key])

        return True

    def _preprocess(self, job):
        '''
        Build a jobspec from an arec upload.

        Reads information.json from the upload dir, looks up the room for
        the recorder in self.config, initializes and fills in the jobspec
        using the helpers _init_jobspec and _fill_jobspec_from_daisy
        (defined outside this class), and then adds one source per channel
        file found in the upload dir.

        Returns a dict holding the 'jobid' of the incoming job and the
        resulting 'jobspec'.

        Raises KeyError if the recorder's device description has no entry
        in self.config.
        '''
        jobid = job['jobid']
        queueitem = job['queueitem']
        upload_dir = Path(queueitem['upload_dir'])
        arec_data = self._parse_arec_json(upload_dir)
        raw_title = arec_data['Title']
        recorder = arec_data['Device_description']
        room_id = self.config[recorder]

        # Any timezone information is dropped, so the values passed on
        # below are naive wall clock times.
        starttime_tz = datetime.fromisoformat(arec_data['Start_time'])
        starttime = starttime_tz.replace(tzinfo=None)
        endtime_tz = datetime.fromisoformat(arec_data['End_time'])
        endtime = endtime_tz.replace(tzinfo=None)

        # NOTE(review): timestamp() on a naive datetime interprets it in
        # the system's local timezone, so this is only the true epoch time
        # if the recorder and this host share a timezone - confirm that
        # this is intended.
        start_timestamp = int(starttime.timestamp())
        outspec = self._init_jobspec(upload_dir,
                                     start_timestamp,
                                     raw_title)

        self._fill_jobspec_from_daisy(starttime, endtime, room_id, outspec)

        # Populate the sources
        sources = {}
        for item in upload_dir.iterdir():
            match = self.name_regex.match(item.name)
            if not match:
                # Not a channel file, nothing to do
                continue
            # The channel name (group 4) is matched but not used
            item_channel_no = match.group(3)
            sources[f'Channel {item_channel_no}'] = {
                'video': item.name,
                'poster': '',
                # Only channel 1 provides the audio for playback
                'playAudio': item_channel_no == '1'}
        outspec['sources'] = sources

        # Configure subtitle generation settings
        if 'Channel 1' in sources:
            outspec['generate_subtitles'] = {'Generated':
                                             {'type': 'whisper',
                                              'source': 'Channel 1'}}

        return {'jobid': jobid,
                'jobspec': outspec}

    def _parse_arec_json(self, upload_dir: Path) -> dict:
        '''
        Read information.json from the upload dir and flatten it.

        The file is expected to hold a list of objects which each have a
        'name' and a 'value' key. The result maps each name to its value.

        Errors from opening or parsing the file are not caught here.
        '''
        # JSON is read explicitly as UTF-8 rather than depending on
        # whatever the locale of the host happens to be.
        with open(upload_dir / 'information.json', encoding='utf-8') as f:
            data = json.load(f)
        return {item['name']: item['value'] for item in data}