import flattenDeep from 'lodash/flattenDeep';

/**
 * Placeholder characters used while splitting. Both are ASCII control characters, and control
 * characters (other than the horizontal tab) are stripped from the input before any placeholder
 * is inserted, so they can never collide with caller-supplied text the way literal tokens such
 * as "<period>" or "<stop>" could.
 */
const PERIOD_MARK = '\u0001';
const STOP_MARK = '\u0002';

/**
 * Ordered rewrite rules that swap periods which do NOT end a sentence for PERIOD_MARK.
 * Order matters: three-letter initialisms must be handled before two-letter ones, and
 * urls with a subdomain before bare domains.
 */
const PERIOD_PROTECTION_RULES: Array<[RegExp, string]> = [
  // A period with a digit on both sides: decimals (3.50) and dotted versions (1.2.3).
  // The lookahead leaves the trailing digit unconsumed so chained groups are all protected.
  [/([0-9])[.](?=[0-9])/g, `$1${PERIOD_MARK}`],
  // Titles and company suffixes. The word boundary keeps "Co." from matching inside "PepsiCo.".
  [
    /\b(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt|Inc|Ltd|Jr|Sr|Co)[.]/g,
    `$1${PERIOD_MARK}`,
  ],
  [/Ph\.D\./g, `Ph${PERIOD_MARK}D${PERIOD_MARK}`],
  [/e\.g\./g, `e${PERIOD_MARK}g${PERIOD_MARK}`],
  [/i\.e\./g, `i${PERIOD_MARK}e${PERIOD_MARK}`],
  // The word boundary keeps "vs." from matching inside words such as "revs.".
  [/\bvs\./g, `vs${PERIOD_MARK}`],
  // Three-letter initialisms, ex: U.S.A.
  [
    /([A-Za-z])[.]([A-Za-z])[.]([A-Za-z])[.]/g,
    `$1${PERIOD_MARK}$2${PERIOD_MARK}$3${PERIOD_MARK}`,
  ],
  // Two-letter initialisms, ex: a.m.
  [/([A-Za-z])[.]([A-Za-z])[.]/g, `$1${PERIOD_MARK}$2${PERIOD_MARK}`],
  // Accounts for urls with a subdomain, ex: www.website.com
  [
    /(\w+)[.](\w+)[.](com|co|net|org|io|gov|me|edu|ai)/g,
    `$1${PERIOD_MARK}$2${PERIOD_MARK}$3`,
  ],
  // Accounts for urls without a subdomain, ex: website.com
  [/[.](com|co|net|org|io|gov|me|edu|ai)/g, `${PERIOD_MARK}$1`],
];

/**
 * Splits a string into an array of sentences.
 *
 * Each line of the input is processed independently, so a newline always ends a sentence.
 * Within a line, a sentence ends at `.`, `!` or `?` together with any directly following
 * terminators, closing quotes or closing parentheses. Periods inside decimals, common
 * abbreviations, initialisms and urls are not treated as sentence ends. Non-printable
 * control characters (except the horizontal tab) are removed from the output.
 *
 * Inspired by:
 * https://stackoverflow.com/questions/11761563/javascript-regexp-for-splitting-text-into-sentences-and-keeping-the-delimiter
 * https://stackoverflow.com/questions/25735644/python-regex-for-splitting-text-into-sentences-sentence-tokenizing
 * https://stackoverflow.com/questions/4576077/python-split-text-on-sentences
 *
 * This method covers most cases, but is not perfect. If this feature becomes core to the product,
 * we may consider incorporating a natural language processor such as:
 * https://github.com/explosion/spaCy
 * https://github.com/nltk/nltk
 *
 * @param text - The text to split; may contain multiple lines.
 * @returns The trimmed, non-empty sentences in their original order.
 */
export default function splitIntoSentences(text: string): Array<string> {
  if (text.includes('\n')) {
    const sentencesPerLine = text
      .split('\n')
      .map(line => splitIntoSentences(line));
    return flattenDeep(sentencesPerLine);
  }

  /**
   * Remove non-printable characters: 0x00 - 0x08 and 0x0A - 0x1F. 0x09, the horizontal tab,
   * is kept. See http://www.asciitable.com/. This also guarantees that PERIOD_MARK and
   * STOP_MARK do not occur in the text before the rules below insert them.
   */
  const sanitized = text.trim().replace(/[\x00-\x08\x0A-\x1F]+/g, '');

  const guarded = PERIOD_PROTECTION_RULES.reduce(
    (current, [pattern, replacement]) => current.replace(pattern, replacement),
    sanitized
  );

  return (
    guarded
      // Mark the end of each sentence, keeping trailing terminators, closing quotes
      // (straight and curly) and closing parentheses attached to it.
      .replace(/([.!?][.!?"\u201D'\u2019)]*)/g, `$1${STOP_MARK}`)
      // Restore the protected periods now that the sentence ends are marked.
      .split(PERIOD_MARK)
      .join('.')
      .split(STOP_MARK)
      .map(sentence => sentence.trim())
      .filter(sentence => sentence.length > 0)
  );
}
