import * as React from "react";

// https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.Speech.StreamingRecognize
/**
 * One transcription hypothesis within a streaming recognition result.
 *
 * Field names follow the `SpeechRecognitionAlternative` message of the Google
 * Cloud Speech-to-Text v1 RPC reference.
 */
export type SpeechRecognitionAlternative = {
  /** Transcribed text for this hypothesis. */
  transcript: string;
  /** NOTE(review): presumably the recognizer's confidence estimate — confirm range with the API reference. */
  confidence: number;
  /** Word-level entries belonging to `transcript`. */
  words: WordInfo[];
};

/**
 * A time span split into whole seconds and a nanosecond remainder.
 *
 * NOTE(review): `seconds` is typed as a string — presumably because the server
 * serializes a 64-bit integer as text; confirm against the server output.
 */
type Duration = {
  seconds: string;
  nanos: number;
};

/**
 * Word-level information attached to a recognition alternative.
 *
 * Field names follow the `WordInfo` message of the Google Cloud Speech-to-Text
 * v1 RPC reference.
 */
export type WordInfo = {
  /** Offset at which the word starts. */
  startTime: Duration;
  /** Offset at which the word ends. */
  endTime: Duration;
  /** The recognized word. */
  word: string;
  /** NOTE(review): presumably identifies the speaker when diarization is enabled — confirm. */
  speakerTag: number;
};

/**
 * One entry of `StreamingRecognizeResponse.results`.
 *
 * Field names follow the `StreamingRecognitionResult` message of the Google
 * Cloud Speech-to-Text v1 RPC reference.
 */
export type StreamingRecognitionResult = {
  /** Candidate transcriptions for this portion of audio. */
  alternatives: SpeechRecognitionAlternative[];
  /** Whether this result is final (as opposed to an interim result). */
  isFinal: boolean;
  /** NOTE(review): presumably an estimate of how likely an interim result is to stay unchanged — confirm. */
  stability: number;
  /** Offset of the end of this result. */
  resultEndTime: Duration;
  /** NOTE(review): presumably the audio channel this result belongs to — confirm. */
  channelTag: number;
};

/**
 * Error payload carried by `StreamingRecognizeResponse.error`.
 *
 * NOTE(review): the shape looks like a `google.rpc.Status` message — confirm
 * against what the transcription server actually forwards.
 */
type Status = {
  /** Numeric status code. */
  code: number;
  /** Error description. */
  message: string;
  /** Additional, untyped error details. */
  details: any[];
};

/**
 * A message received from the transcription server that is not a byte-offset
 * update; the hook stores it as `ResponseWithTimestamp.response`.
 *
 * Field names follow the `StreamingRecognizeResponse` message of the Google
 * Cloud Speech-to-Text v1 RPC reference.
 */
export type StreamingRecognizeResponse = {
  /** Error reported for the stream, or `null` when there is none. */
  error: Status | null;
  /** Recognition results contained in this response. */
  results: StreamingRecognitionResult[];
  /** Kind of speech event signalled by this response. */
  speech_event_type:
    | "SPEECH_EVENT_UNSPECIFIED"
    | "END_OF_SINGLE_UTTERANCE";
};

/**
 * A server response annotated with client-side bookkeeping, as published by
 * `useTranscriptionServer`.
 */
export type ResponseWithTimestamp = {
  /** The parsed server message. */
  response: StreamingRecognizeResponse;
  /** `Date.now()` at the moment the message was received. */
  timestamp: number;
  /** Sample rate of the audio stream the response belongs to. */
  sampleRate: number;
  /** Most recent `byteOffset` value reported by the server before this response. */
  byteOffset: number;
};

/**
 * Reports an error caught somewhere in this module.
 *
 * Currently this only logs to the console.
 *
 * @param e - The caught value. Typed `unknown` so values coming straight from
 *   a `catch` clause or a promise rejection can be passed without a cast.
 */
function captureException(e: unknown): void {
  console.error(e);
}

//
// This is a React hook to hook a MediaStream up to a transcription server process.
//
// Audio is sent over a WebSocket in LINEAR16 format. Google recommends a lossless format for
// the best results. Alternatively, we could use FLAC instead to save some bandwidth. See
// this project for an example:
//   https://github.com/mmig/speech-to-flac
//   https://github.com/mmig/libflac.js
//
/**
 * Streams audio from `mediaStream` to the transcription server and exposes the
 * connection state together with the most recent server response.
 *
 * Lifecycle, as implemented by the effects below:
 *  1. An `AudioContext` is created (16 kHz first; the browser default rate
 *     after `createMediaStreamSource` fails with `NotSupportedError`).
 *  2. Once a source node exists for `mediaStream`, the context's sample rate
 *     is stored, which triggers the WebSocket connection.
 *  3. When the socket opens, an `audio-start` message carrying `jwt`, the
 *     encoding and the sample rate is sent, followed by either a
 *     `streamingRecognizeRequest` or a `transcriptions-off` message depending
 *     on `languageCode`.
 *  4. A ScriptProcessorNode converts every audio frame of channel 0 to 16-bit
 *     PCM and sends it as a binary message while the socket is open.
 *  5. A closed socket is re-opened after a randomized, exponentially growing
 *     delay that is capped at 30 seconds.
 *
 * @param jwt - Token sent to the server in the `audio-start` message.
 * @param mediaStream - Stream used as the audio input; `null` while no stream
 *   is available.
 * @param languageCode - Language code placed in the recognition config;
 *   `null` sends `transcriptions-off` instead.
 * @returns A tuple `[readyState, lastResponse]`. `readyState` is `-1` while no
 *   socket exists and otherwise mirrors `WebSocket.readyState`; `lastResponse`
 *   is the latest non-byte-offset server message, or `null` before the first.
 */
export function useTranscriptionServer(
  jwt: string,
  mediaStream: MediaStream | null,
  languageCode: string | null
): [number, ResponseWithTimestamp | null] {
  // Named equivalents of the numeric WebSocket.readyState values used below.
  const WS_NOT_CREATED = -1;
  const WS_OPEN = 1;
  const WS_CLOSED = 3;
  // Upper bound for the reconnection delay. Without it the exponential term
  // eventually exceeds the 32-bit limit of setTimeout, which makes the timer
  // fire immediately and turns the backoff into a tight reconnect loop.
  const MAX_RECONNECT_DELAY_MS = 30000;

  // Manage WebSocket connection to transcription server
  const wsRef = React.useRef<WebSocket | null>(null);
  const [lastResponse, setLastResponse] =
    React.useState<ResponseWithTimestamp | null>(null);
  const [readyState, setReadyState] = React.useState(WS_NOT_CREATED);
  const [reconnectCount, setReconnectCount] = React.useState(0);
  const [sampleRate, setSampleRate] = React.useState<number | null>(null);

  //
  // Create a WebSocket connection to the transcription server.
  //
  React.useEffect(() => {
    // No sample rate means there is no audio to send yet.
    if (sampleRate == null) return undefined;
    const rate: number = sampleRate;

    const protocol = window.location.protocol === "https:" ? "wss:" : "ws:";
    const port = window.location.hostname === "localhost" ? ":8181" : "";
    const url = `${protocol}//${window.location.hostname}${port}/ws/`;
    console.log(`Connecting to transcription server on ${url}`);

    const ws = new WebSocket(url);
    setReadyState(ws.readyState);

    // Last byte offset reported by the server; attached to every response.
    let byteOffset = 0;

    const syncReadyState = () => setReadyState(ws.readyState);

    const openHandler = () => {
      console.log("connected");
      ws.send(
        JSON.stringify({
          type: "audio-start",
          jwt,
          encoding: "LINEAR16",
          sampleRate: rate,
        })
      );
      syncReadyState();
    };

    // We only get an event, not an actual error. As far as I can tell, this is only triggered
    // when we can't initiate a connection to the server.
    const errorHandler = () => {
      console.log("error connecting to transcription server");
      syncReadyState();
    };

    const messageHandler = (event: MessageEvent) => {
      try {
        if (typeof event.data !== "string") {
          throw new Error("Unexpected data type: " + typeof event.data);
        }

        const parsed: unknown = JSON.parse(event.data);
        if (parsed == null || typeof parsed !== "object") {
          throw new Error("Unexpected message payload: " + event.data);
        }

        // Messages with a numeric `byteOffset` only update the bookkeeping;
        // everything else is published as a response.
        const { byteOffset: reportedOffset } = parsed as {
          byteOffset?: unknown;
        };
        if (typeof reportedOffset === "number") {
          byteOffset = reportedOffset;
          return;
        }

        setLastResponse({
          response: parsed as StreamingRecognizeResponse,
          timestamp: Date.now(),
          sampleRate: rate,
          byteOffset,
        });
      } catch (error) {
        captureException(error);
      }
    };

    ws.addEventListener("open", openHandler);
    ws.addEventListener("close", syncReadyState);
    ws.addEventListener("error", errorHandler);
    ws.addEventListener("message", messageHandler);

    wsRef.current = ws;

    return () => {
      ws.removeEventListener("open", openHandler);
      ws.removeEventListener("close", syncReadyState);
      ws.removeEventListener("error", errorHandler);
      ws.removeEventListener("message", messageHandler);
      ws.close();
      // Do not leave a dead socket behind for the other effects to find.
      if (wsRef.current === ws) {
        wsRef.current = null;
      }
      setReadyState(WS_NOT_CREATED);
    };
  }, [jwt, sampleRate, reconnectCount]);

  //
  // Tell the server which language to transcribe (or to stop transcribing).
  //
  React.useEffect(() => {
    const ws = wsRef.current;
    if (readyState !== WS_OPEN || sampleRate == null || ws == null) {
      return;
    }
    // `readyState` can lag behind a socket that was replaced during this
    // commit; send() throws on a socket that is still connecting. The effect
    // runs again once the new socket reports OPEN.
    if (ws.readyState !== WS_OPEN) {
      return;
    }

    const message =
      languageCode == null
        ? { type: "transcriptions-off" }
        : {
            type: "streamingRecognizeRequest",
            streamingRecognizeRequest: {
              config: {
                enableWordTimeOffsets: true,
                languageCode,
              },
              interimResults: true,
            },
          };

    ws.send(JSON.stringify(message));
  }, [readyState, sampleRate, languageCode]);

  //
  // Handle reconnection logic
  //
  React.useEffect(() => {
    if (readyState !== WS_CLOSED) return undefined;

    // Use an exponential backoff algorithm to manage reconnections
    // https://stackoverflow.com/a/37038217
    const ceiling = Math.min(
      MAX_RECONNECT_DELAY_MS,
      Math.pow(2, reconnectCount) * 1000
    );
    const reconnectIn = ceiling * Math.random();

    console.log("reconnecting in", reconnectIn);
    // Bumping the counter re-runs the connection effect.
    const timeoutId = setTimeout(() => {
      setReconnectCount((count) => count + 1);
    }, reconnectIn);

    return () => clearTimeout(timeoutId);
  }, [readyState, reconnectCount]);

  //
  // Build an AudioContext
  //
  const [audioContext, setAudioContext] = React.useState<AudioContext | null>(
    null
  );
  const [audioContextState, setAudioContextState] = React.useState<
    string | null
  >(null);
  const [downsamplingFailed, setDownsamplingFailed] = React.useState(false);

  React.useEffect(() => {
    let created: AudioContext;
    try {
      // Ask for 16 kHz unless a previous attempt to attach the stream to such
      // a context failed; then fall back to the default sample rate.
      created = downsamplingFailed
        ? new AudioContext()
        : new AudioContext({ sampleRate: 16000 });
    } catch (e) {
      setAudioContext(null);
      setAudioContextState(null);
      captureException(e);
      return undefined;
    }
    const ctx = created;

    setAudioContext(ctx);
    setAudioContextState(ctx.state);

    const onStateChange = () => setAudioContextState(ctx.state);
    ctx.addEventListener("statechange", onStateChange);

    return () => {
      ctx.removeEventListener("statechange", onStateChange);
      // close() returns a promise; report a rejection instead of leaving it unhandled.
      ctx.close().catch(captureException);
    };
  }, [downsamplingFailed]);

  //
  // Create an AudioNode from the user media stream.
  //
  const [audioSource, setAudioSource] =
    React.useState<MediaStreamAudioSourceNode | null>(null);

  React.useEffect(() => {
    if (mediaStream == null || audioContext == null) return undefined;

    try {
      if (audioContextState === "suspended") {
        console.log(audioContext.state);
        console.log("trying to resume AudioContext");
        // The "statechange" listener re-runs this effect once resumed.
        audioContext.resume().catch(captureException);
        return undefined;
      }

      const track = mediaStream.getAudioTracks()[0];
      if (track == null) {
        console.log("MediaSource has no audio tracks");
        return undefined;
      }

      if (track.getSettings().noiseSuppression) {
        console.warn(
          "The audio being used for transcriptions has noiseSuppression enabled. Audio signal should be clean."
        );
      }

      let source: MediaStreamAudioSourceNode;
      try {
        source = audioContext.createMediaStreamSource(mediaStream);
      } catch (error) {
        const errorName =
          typeof error === "object" && error !== null && "name" in error
            ? (error as { name: unknown }).name
            : undefined;

        if (errorName === "NotSupportedError") {
          console.warn(
            "NotSupportedError when trying to capture audio from media stream. This is expected in Firefox. Retrying with a reconfigured AudioContext."
          );
          setDownsamplingFailed(true);
        } else {
          captureException(error);
        }
        return undefined;
      }

      setAudioSource(source);
      // This will trigger the WebSocket to connect
      setSampleRate(audioContext.sampleRate);

      return () => {
        setSampleRate(null);
        // Drop the node as well so the processing graph built on top of it is
        // torn down when the stream or the context goes away.
        setAudioSource(null);
      };
    } catch (error) {
      captureException(error);
      return undefined;
    }
  }, [audioContext, mediaStream, audioContextState]);

  //
  // Pump audio frames from the source node into the WebSocket.
  //
  React.useEffect(() => {
    if (audioSource == null || audioContext == null) return undefined;

    try {
      // According to google, 100ms is a good frame size. This will pick a power of two number
      // that is close to that for the given sample size.
      //
      // According to the spec, these are the only valid buffer sizes:
      //  256, 512, 1024, 2048, 4096, 8192, 16384
      // https://webaudio.github.io/web-audio-api/#scriptprocessornode
      const contextRate = audioContext.sampleRate;
      const frameSize =
        contextRate === 8000
          ? 1024
          : contextRate === 16000
          ? 2048
          : 4096; // Sample rate is probably 44100 or 48000

      const scriptNode = audioContext.createScriptProcessor(
        frameSize,
        audioSource.channelCount,
        audioContext.destination.channelCount
      );

      scriptNode.onaudioprocess = (event: AudioProcessingEvent) => {
        const ws = wsRef.current;
        if (ws == null || ws.readyState !== WS_OPEN) return;
        // Only the first channel is transmitted.
        ws.send(floatTo16BitPCM(event.inputBuffer.getChannelData(0)));
      };

      // NOTE(review): the zero-gain node presumably keeps the processor wired
      // to the destination without playing the input back — confirm.
      const gainNode = audioContext.createGain();
      gainNode.gain.value = 0;

      audioSource.connect(scriptNode);
      scriptNode.connect(gainNode);
      gainNode.connect(audioContext.destination);

      return () => {
        scriptNode.onaudioprocess = null;
        audioSource.disconnect();
        scriptNode.disconnect();
        gainNode.disconnect();
      };
    } catch (error) {
      captureException(error);
      return undefined;
    }
  }, [audioSource, audioContext]);

  return [readyState, lastResponse];
}

// Taken from here: http://watson-developer-cloud.github.io/speech-javascript-sdk/v0.7.5/webaudio-l16-stream.js.html
/**
 * Converts floating-point audio samples to 16-bit signed little-endian PCM.
 *
 * Each sample is clamped to [-1, 1] before scaling. Without the clamp an
 * out-of-range sample (e.g. 1.5) overflows the 16-bit range and wraps around
 * to a value of the opposite sign. `NaN` samples are encoded as 0.
 *
 * @param input - Samples nominally in the range [-1, 1].
 * @returns A new buffer of `input.length * 2` bytes.
 */
function floatTo16BitPCM(input: Float32Array): ArrayBuffer {
  const BYTES_PER_SAMPLE = 2;
  const buffer = new ArrayBuffer(input.length * BYTES_PER_SAMPLE);
  const view = new DataView(buffer);

  input.forEach((raw, index) => {
    const sample = Math.max(-1, Math.min(1, raw));
    // 16-bit signed range is -32768 to 32767, hence the asymmetric scale.
    const scale = sample < 0 ? 0x8000 : 0x7fff;
    // `| 0` truncates toward zero; `true` selects little-endian byte order.
    view.setInt16(index * BYTES_PER_SAMPLE, (sample * scale) | 0, true);
  });

  return buffer;
}
