import json
import os
import re
from datetime import datetime
from pathlib import Path

from .preprocessor import Preprocessor


@Preprocessor.register
class ArecProcessor(Preprocessor):
    '''
    Preprocessing for arec jobs.

    Requires that at least CH1 be present in the upload.
    This is not checked by the validation function - it must be
    configured properly when the recorder is installed.

    CH1 is used for audio playback and subtitles generation,
    so should be the camera input.
    '''

    def __init__(self, *args, **kwargs):
        '''
        Forward all arguments to the parent class and compile the regex
        used to recognise the per-channel video files of a capture.
        '''
        super().__init__(*args, **kwargs)
        # This regex matches the name format for the individual channel files
        # in a capture, which is this format:
        #   some-prefix_1970_12_31_23_45_00_CH?_Name.mp4
        # The above ? represents a number between 1 and 4.
        # Capture group 1 is there to make the upload name prefix optional.
        # Capture group 2 is only there to shorten the regex.
        # Capture group 3 captures the channel number,
        # capture group 4 captures the channel name.
        #
        # A raw string is used so that \d reaches the regex engine untouched,
        # and the dot before the extension is escaped so that it only matches
        # a literal '.' rather than any character.
        self.name_regex = re.compile(
            r'^([^_]+_)?\d{4}(_\d{2}){5}_CH([1-4])_([^.]+)\.mp4$')

    def validate(self, queueitem):
        '''
        Check that a queue item describes a usable arec upload.

        The item must carry an 'upload_dir' that exists on disk, and the
        information.json inside it must provide 'Title',
        'Device_description', 'Start_time' and 'End_time', the latter two
        being parseable by datetime.fromisoformat.

        Returns True when all checks pass.

        Raises KeyError if 'upload_dir' or a required json key is missing,
        and ValueError if the upload directory does not exist or a
        timestamp is malformed. Errors from reading or parsing
        information.json propagate unchanged.
        '''
        if 'upload_dir' not in queueitem:
            raise KeyError('upload_dir missing from job specification.')

        upload_dir = Path(queueitem['upload_dir'])
        if not upload_dir.exists():
            raise ValueError('Specified upload_dir does not exist.')

        arec_data = self._parse_arec_json(upload_dir)

        required_strings = ['Title', 'Device_description']
        required_datetimes = ['Start_time', 'End_time']

        for key in required_strings + required_datetimes:
            if key not in arec_data:
                raise KeyError(f'{key} missing from arec json file.')

        for key in required_datetimes:
            # Will throw an exception if the format is invalid
            datetime.fromisoformat(arec_data[key])

        return True

    def _preprocess(self, job):
        '''
        Build the job specification for an arec upload.

        Reads 'jobid' and 'queueitem' from the passed job, parses the
        information.json in the upload directory and registers every file
        matching the channel naming scheme as a source named
        'Channel <n>'. Channel 1 is the only source with audio playback
        enabled and, when present, is configured as the source for
        whisper subtitle generation.

        Returns a dict with the keys 'jobid' and 'jobspec'.
        '''
        jobid = job['jobid']
        queueitem = job['queueitem']
        upload_dir = Path(queueitem['upload_dir'])

        arec_data = self._parse_arec_json(upload_dir)
        raw_title = arec_data['Title']
        recorder = arec_data['Device_description']
        # The configuration maps the recorder's device description to a
        # room id; an unconfigured recorder raises here.
        room_id = self.config[recorder]

        # The timezone information is dropped, keeping the wall-clock time.
        # NOTE(review): timestamp() on a naive datetime is interpreted in
        # the local timezone of the machine running this code - confirm
        # that this matches the timezone the recorder reports.
        starttime_tz = datetime.fromisoformat(arec_data['Start_time'])
        starttime = starttime_tz.replace(tzinfo=None)
        endtime_tz = datetime.fromisoformat(arec_data['End_time'])
        endtime = endtime_tz.replace(tzinfo=None)
        start_timestamp = int(starttime.timestamp())

        # Both helpers are defined outside this class (presumably on
        # Preprocessor); the second one is handed outspec along with the
        # recording times and room id.
        outspec = self._init_jobspec(upload_dir, start_timestamp, raw_title)
        self._fill_jobspec_from_daisy(starttime, endtime, room_id, outspec)

        # Populate the sources
        # NOTE(review): iterdir() yields entries in arbitrary order, so if
        # two files match the same channel number the one seen last wins.
        sources = {}
        for item in upload_dir.iterdir():
            match = self.name_regex.match(item.name)
            if not match:
                continue
            item_channel_no = match.group(3)
            sources[f'Channel {item_channel_no}'] = {
                'video': item.name,
                'poster': '',
                # Only channel 1 plays audio
                'playAudio': item_channel_no == '1',
            }
        outspec['sources'] = sources

        # Configure subtitle generation settings
        if 'Channel 1' in sources:
            outspec['generate_subtitles'] = {
                'Generated': {'type': 'whisper', 'source': 'Channel 1'}}

        return {'jobid': jobid, 'jobspec': outspec}

    def _parse_arec_json(self, upload_dir: Path) -> dict:
        '''
        Load information.json from upload_dir and flatten it.

        The file is expected to hold a sequence of objects that each have a
        'name' and a 'value' entry; the result maps each name to its value.
        '''
        # Open explicitly as UTF-8 instead of relying on the locale default.
        with open(upload_dir / 'information.json', encoding='utf-8') as f:
            data = json.load(f)
        return {item['name']: item['value'] for item in data}