// Matches e-mail-like tokens: a run of letters, digits, `.`, `_` or `-`, an `@`,
// another such run, a literal `.`, and a final run of the same characters.
// The final run also accepts `.`, so a match can end with a trailing period.
const EMAIL_REGEX = /([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+)/gi;
// Matches URLs with an optional `http://` / `https://` prefix: either `www`, `bit`,
// `t` or `tinyurl` followed by a dot and 2+ letters, or a label of letters, digits
// and hyphens followed by one or more `.xx` suffixes (2+ letters each). An optional
// `/path` of non-whitespace characters may follow. Because the scheme is optional,
// any bare `word.word` token matches as well.
const URL_REGEX = /(?:https?:\/\/)?(?:(?:www|bit|t|tinyurl)\.[a-z]{2,}|[a-z0-9-]+(?:\.[a-z]{2,})+)(?:\/[^\s]*)?/gi;
// Matches leftover URL fragments: starts at `http://`, `https://`, `;to/`, `.com/`,
// `.co/`, `.ly/` or `.be/` and runs up to the next whitespace character.
const MALFORMED_URL_REGEX = /(?:https?:\/\/|;to\/|\.com\/|\.co\/|\.ly\/|\.be\/)[^\s]*/gi;
// Matches a tag-shaped span: `<`, any characters other than `>`, then `>`.
const HTML_REGEX = /<[^>]*>/g;
// Matches an HTML character reference: `&`, then a name (letters/digits), a decimal
// `#NNN` (1-6 digits) or a hex `#xHHH` (1-6 hex digits), then `;`.
// Capture group 1 holds the text between `&` and `;`.
const HTML_ENTITY_REGEX = /&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-fA-F]{1,6});/gi;
// Matches `@` or `#` followed by one or more word characters or hyphens.
const SOCIAL_HANDLES_REGEX = /(?:@[\w-]+|#[\w-]+)/g;

/** Result returned by `cleanDescription`. */
interface CleanDescriptionResult {
  /** The description text after all cleanup steps have been applied. */
  description: string;
  /** Lower-cased first e-mail match found in the input, or `null` when there is none. */
  extractedEmail: string | null;
}

// Named character references understood when no DOM is available.
// A Map (not a plain object) so names such as `constructor` cannot resolve to
// inherited Object.prototype members.
const FALLBACK_NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', '\u00a0'],
]);

/**
 * Best-effort decoding of a single character reference without a DOM.
 *
 * @param match - The full reference text, e.g. `&#39;`.
 * @param entity - The text between `&` and `;`, e.g. `#39`.
 * @returns The decoded character, or `match` unchanged when the reference is
 *          an unknown name or does not denote a valid Unicode scalar value.
 */
function decodeEntityWithoutDom(match: string, entity: string): string {
  if (!entity.startsWith('#')) {
    const named = FALLBACK_NAMED_ENTITIES.get(entity);
    return named === undefined ? match : named;
  }

  const isHex = entity.charAt(1).toLowerCase() === 'x';
  const codePoint = isHex
    ? Number.parseInt(entity.slice(2), 16)
    : Number.parseInt(entity.slice(1), 10);

  // Up to six hex digits can exceed U+10FFFF, and String.fromCodePoint throws
  // on out-of-range input, so validate before converting. NUL and surrogates
  // are left undecoded as well.
  const isScalarValue =
    codePoint > 0 &&
    codePoint <= 0x10ffff &&
    (codePoint < 0xd800 || codePoint > 0xdfff);

  return isScalarValue ? String.fromCodePoint(codePoint) : match;
}

/**
 * Decodes HTML character references (`&amp;`, `&#39;`, `&#x27;`, ...) in `text`.
 *
 * When a `document` global exists, every match of `HTML_ENTITY_REGEX` is written
 * to a detached `<textarea>` through `innerHTML` and read back through `value`,
 * so the DOM performs the decoding. When there is no `document` (for example
 * during server-side execution), numeric references are decoded directly and a
 * small set of common named references is translated; everything else is kept
 * as written instead of throwing.
 *
 * @param text - Text that may contain character references.
 * @returns `text` with every decodable reference replaced by its character.
 */
function decodeHTMLEntities(text: string): string {
  const textarea =
    typeof document === 'undefined' ? null : document.createElement('textarea');

  return text.replace(HTML_ENTITY_REGEX, (match: string, entity: string) => {
    if (textarea === null) {
      return decodeEntityWithoutDom(match, entity);
    }
    textarea.innerHTML = match;
    // An empty result means nothing was decoded; keep the original text.
    return textarea.value || match;
  });
}

/**
 * Sanitizes a free-text description and extracts the first e-mail address in it.
 *
 * Steps, each applied to the output of the previous one:
 *  1. Record the first e-mail match (lower-cased, trailing `.`, `_`, `-` trimmed)
 *     and remove every e-mail match from the text.
 *  2. Strip HTML tags.
 *  3. Remove URLs and URL-like fragments.
 *  4. Decode HTML entities.
 *  5. Remove `@handle` / `#hashtag` tokens.
 *  6. Drop promo phrases and `;fragment` leftovers, collapse repeated
 *     punctuation, and collapse all whitespace (including line breaks) to
 *     single spaces.
 *
 * @param description - Raw text to clean.
 * @returns The cleaned, single-line text and the extracted e-mail (or `null`).
 */
export function cleanDescription(description: string): CleanDescriptionResult {
  let cleanedDescription = description;
  let extractedEmail: string | null = null;

  // Extract first email if found
  const emailMatch = description.match(EMAIL_REGEX);
  if (emailMatch) {
    // The e-mail pattern can swallow sentence punctuation that directly follows
    // the address ("john@example.com."), so trim it from the extracted value.
    extractedEmail = emailMatch[0].toLowerCase().replace(/[._-]+$/, '');
    // Remove all emails from description
    cleanedDescription = cleanedDescription.replace(EMAIL_REGEX, '');
  }

  // Remove HTML tags before URLs: the URL patterns run up to the next
  // whitespace, so a URL inside an attribute would take the closing `">` with
  // it and leave a dangling `<` that the tag pattern then extends to some
  // later `>`, deleting the text in between.
  cleanedDescription = cleanedDescription.replace(HTML_REGEX, '');

  // Remove URLs and malformed URLs
  cleanedDescription = cleanedDescription
    .replace(URL_REGEX, '')
    .replace(MALFORMED_URL_REGEX, '');

  // Decode HTML entities before stripping handles: a numeric reference such as
  // `&#39;` contains `#39`, which would otherwise be removed as a hashtag and
  // leave `&;` behind.
  cleanedDescription = decodeHTMLEntities(cleanedDescription);

  // Remove social media handles
  cleanedDescription = cleanedDescription.replace(SOCIAL_HANDLES_REGEX, '');

  // Additional cleanup
  cleanedDescription = cleanedDescription
    // Remove common spam/promo phrases (up to the end of the line or sentence)
    .replace(/(?:submit|send|dm|message|follow|contact)(?:\s+(?:your?|us|me|tracks?|music|demos?|here))?\s*(?:@|at|to|via|through)\s*[^\n.]*/gi, '')
    // Remove any remaining semicolon patterns that might be malformed URLs
    .replace(/;[^\s;]{2,}/g, '')
    // Remove excessive punctuation
    .replace(/([!?,.]){2,}/g, '$1')
    // Collapse every whitespace run to one space; `\s` also matches newlines,
    // so the result is always a single line.
    .replace(/\s+/g, ' ')
    .trim();

  return {
    description: cleanedDescription,
    extractedEmail
  };
}