"""
OffersExtractor Flask API - Production Ready
Extracts internship offers from PDF files using DeepSeek AI
"""
import os
import json
import time
import tempfile
import logging
from pathlib import Path
from typing import List, Dict, Optional

from flask import Flask, request, jsonify
from flask_cors import CORS
from werkzeug.utils import secure_filename
import fitz  # PyMuPDF
import requests
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Initialize Flask app
app = Flask(__name__)

# Configure CORS (will be handled by nginx proxy in production)
CORS(app, resources={r"/*": {"origins": "*"}})

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")
DEEPSEEK_API_URL = "https://api.deepseek.com/chat/completions"
MAX_FILE_SIZE = 15 * 1024 * 1024  # 15MB
ALLOWED_EXTENSIONS = {'pdf'}
UPLOAD_FOLDER = tempfile.gettempdir()

# Enforce the upload limit. Without MAX_CONTENT_LENGTH Flask accepts request
# bodies of any size, so the 413 error handler would never be reached.
app.config['MAX_CONTENT_LENGTH'] = MAX_FILE_SIZE


def allowed_file(filename: str) -> bool:
    """Return True when the text after the last dot is an allowed extension.

    The comparison is case-insensitive; a name without any dot is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS


def _coerce_offer_list(json_data) -> List[Dict]:
    """Normalize a decoded LLM payload into a list of offer dictionaries.

    The request asks for ``json_object`` output, so the model may wrap the
    list in an object under an arbitrary key, or return one offer object
    on its own. Anything that is not a dict is dropped from the result.
    """
    if isinstance(json_data, dict):
        wrapped = json_data.get('offers')
        if isinstance(wrapped, list):
            json_data = wrapped
        elif 'title' in json_data:
            # A single offer returned as a bare object.
            json_data = [json_data]
        else:
            # Fall back to the first list value that actually holds objects
            # (this skips plain string lists such as skills or tags).
            json_data = next(
                (
                    value for value in json_data.values()
                    if isinstance(value, list)
                    and any(isinstance(item, dict) for item in value)
                ),
                [],
            )

    if not isinstance(json_data, list):
        return []
    return [item for item in json_data if isinstance(item, dict)]


def get_offers_from_llm(text_content: str) -> List[Dict]:
    """
    Sends text from a single page to the DeepSeek API to get structured JSON offers.

    Args:
        text_content: Raw text of one PDF page.

    Returns:
        A list of offer dictionaries; empty when the text is blank or the
        model response contains no offers.

    Raises:
        ValueError: If DEEPSEEK_API_KEY is not configured.
        RuntimeError: If the HTTP request fails or the response cannot be
            parsed. The original exception is chained as ``__cause__``.
    """
    if not DEEPSEEK_API_KEY:
        raise ValueError("DEEPSEEK_API_KEY not configured")

    if not text_content.strip():
        return []

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
    }

    system_prompt = """
    You are a highly precise data extraction assistant. Your task is to find all internship offers from the text and convert them into a structured JSON list.

    Follow these rules STRICTLY:
    1.  An offer is identified by the pattern "SUJET X : [Offer Title]". Use the text after the colon as the `title`.
    2.  The `description` for EACH offer MUST be the unique text found under its "Mission du projet :" heading.
    3.  The `skills` are the list of items found under "Compétences requises :".
    4.  Extract `duration_months` from text like "Durée: 4 mois" or "Duration: 6 months". Convert to integer (e.g., 4, 6).
    5.  Extract `hours_per_week` from text like "40h/semaine" or "35 heures par semaine". Convert to integer (e.g., 40, 35).
    6.  **Payment Rule**: If payment is not mentioned, you MUST set `"is_paid": false`.
    7.  **Date Rule**: If date fields are not mentioned, you MUST OMIT them from the JSON object.
    8.  **Tags Rule**: You MUST generate a list of 3-4 relevant technical keywords for the `tags` field based on the offer's title and skills.
    9.  Return ONLY a valid JSON list `[]`. If you find no offers, return an empty list `[]`.
    
    Example JSON structure:
    [
        {
            "title": "Développeur Full Stack",
            "description": "Mission détaillée...",
            "skills": ["Angular", "Laravel", "MySQL"],
            "duration_months": 6,
            "hours_per_week": 40,
            "is_paid": true,
            "tags": ["Web", "Full Stack", "Angular"]
        }
    ]
    """

    payload = {
        "model": "deepseek-chat",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text_content},
        ],
        "temperature": 0.0,
        "response_format": {"type": "json_object"}
    }

    try:
        response = requests.post(
            DEEPSEEK_API_URL,
            headers=headers,
            json=payload,
            timeout=300
        )
        response.raise_for_status()
        response_json = response.json()
        content_str = response_json['choices'][0]['message']['content']
        json_data = json.loads(content_str)
    except requests.exceptions.RequestException as e:
        logger.error(f"DeepSeek API request failed: {e}")
        raise RuntimeError(f"API Request Error: {e}") from e
    except (json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
        # TypeError covers a null ``content`` or an unexpectedly shaped body.
        logger.error(f"DeepSeek API response parsing failed: {e}")
        raise RuntimeError(f"API Response Error: {e}") from e

    # Handle cases where the LLM wraps the list in a dictionary
    return _coerce_offer_list(json_data)


def _coerce_optional_int(offer: Dict, key: str) -> None:
    """Convert ``offer[key]`` to ``int`` in place, or to ``None`` if it cannot be converted.

    Keys that are absent or already ``None`` are left untouched.
    """
    if offer.get(key) is None:
        return
    try:
        offer[key] = int(offer[key])
    except (ValueError, TypeError):
        offer[key] = None


def extract_offers_from_pdf(pdf_file_path: str) -> List[Dict]:
    """
    Main function to parse a PDF page by page, extract offers, and deduplicate them.

    Only pages whose text contains "SUJET" are sent to the LLM. Offers are
    deduplicated by their stripped description; offers without a usable
    description are dropped. Every returned offer gets
    ``offer_type = 'Stage PFE'`` and has ``duration_months`` /
    ``hours_per_week`` coerced to ``int`` or ``None`` when present.

    Args:
        pdf_file_path: Path of the PDF file to process.

    Returns:
        The list of unique offer dictionaries.

    Raises:
        Exception: Any error from opening the PDF or from the LLM call is
            logged and re-raised unchanged.
    """
    all_offers_raw = []

    try:
        # The context manager closes the document even when a page fails.
        with fitz.open(pdf_file_path) as doc:
            total_pages = len(doc)

            logger.info(f"Processing PDF with {total_pages} pages")

            for page_num in range(total_pages):
                page_text = doc.load_page(page_num).get_text()

                # Skip pages without offer markers
                if "SUJET" not in page_text:
                    continue

                logger.info(f"Processing page {page_num + 1}/{total_pages}")
                offers_from_page = get_offers_from_llm(page_text)

                if offers_from_page:
                    all_offers_raw.extend(offers_from_page)

                # Rate limiting for API calls
                time.sleep(1)

        # Deduplicate offers by description to ensure uniqueness
        unique_offers = []
        seen_descriptions = set()
        for offer in all_offers_raw:
            # The LLM output is untrusted: skip anything that is not an object
            # or whose description is missing, null, or not text.
            if not isinstance(offer, dict):
                continue
            description = offer.get("description")
            if not isinstance(description, str):
                continue
            description = description.strip()
            if not description or description in seen_descriptions:
                continue
            seen_descriptions.add(description)

            # Always set offer_type to "Stage PFE" (this is an internship extractor)
            offer['offer_type'] = 'Stage PFE'

            # Ensure duration_months and hours_per_week are integers or null
            _coerce_optional_int(offer, 'duration_months')
            _coerce_optional_int(offer, 'hours_per_week')

            unique_offers.append(offer)

            # Log the offer to verify offer_type is set
            logger.info(f"Processed offer: '{offer.get('title', 'NO TITLE')}' - offer_type='{offer.get('offer_type', 'MISSING')}'")

        logger.info(f"Extracted {len(unique_offers)} unique offers from {len(all_offers_raw)} total")
        return unique_offers

    except Exception as e:
        logger.error(f"PDF extraction failed: {e}")
        raise


# =====================
# API Routes
# =====================

@app.route('/health', methods=['GET'])
def health():
    """Report that the service is up and whether a DeepSeek API key is set."""
    status_report = {
        "status": "healthy",
        "service": "OffersExtractor",
        "deepseek_configured": bool(DEEPSEEK_API_KEY),
    }
    return jsonify(status_report), 200


@app.route('/extract_offers', methods=['POST'])
def extract_offers():
    """
    Extract internship offers from uploaded PDF file.

    Expected: multipart/form-data with 'pdf' file field
    Returns: {"offers": [...], "count": N}

    Status codes:
        200: Extraction succeeded.
        400: Missing file, empty filename, non-PDF extension, or a
            ValueError raised during extraction.
        500: Missing API key, RuntimeError during extraction, or any
            other unexpected error.
    """
    # Validate DeepSeek API key
    if not DEEPSEEK_API_KEY:
        return jsonify({
            "error": "DEEPSEEK_API_KEY not configured",
            "message": "Server configuration error"
        }), 500

    # Check if file was uploaded
    if 'pdf' not in request.files:
        return jsonify({
            "error": "No file provided",
            "message": "Please upload a PDF file using the 'pdf' form field"
        }), 400

    file = request.files['pdf']

    # Check if filename is empty (also covers a missing filename)
    if not file.filename:
        return jsonify({
            "error": "No file selected",
            "message": "Empty filename"
        }), 400

    # Validate file type
    if not allowed_file(file.filename):
        return jsonify({
            "error": "Invalid file type",
            "message": "Only PDF files are allowed"
        }), 400

    temp_path = None
    try:
        filename = secure_filename(file.filename)

        # mkstemp yields a unique path, so concurrent uploads that share a
        # name can no longer overwrite each other's temporary file.
        fd, temp_path = tempfile.mkstemp(
            prefix="extract_", suffix=".pdf", dir=UPLOAD_FOLDER
        )
        os.close(fd)
        file.save(temp_path)

        logger.info(f"Processing uploaded file: {filename}")

        # Extract offers
        offers = extract_offers_from_pdf(temp_path)

        # Return response
        return jsonify({
            "offers": offers,
            "count": len(offers)
        }), 200

    except ValueError as e:
        logger.error(f"Validation error: {e}")
        return jsonify({
            "error": str(e),
            "message": "Invalid request"
        }), 400

    except RuntimeError as e:
        logger.error(f"Runtime error: {e}")
        return jsonify({
            "error": str(e),
            "message": "Extraction failed"
        }), 500

    except Exception as e:
        logger.error(f"Unexpected error: {e}", exc_info=True)
        return jsonify({
            "error": "Internal server error",
            "message": str(e)
        }), 500

    finally:
        # Clean up the temporary file on success and on every failure path.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError as e:
                logger.warning(f"Failed to delete temp file {temp_path}: {e}")


@app.errorhandler(413)
def request_entity_too_large(error):
    """Answer HTTP 413 with a JSON body that states the size limit in MB."""
    body = {
        "error": "File too large",
        "message": f"Maximum file size is {MAX_FILE_SIZE / (1024 * 1024):.0f}MB",
    }
    return jsonify(body), 413


@app.errorhandler(404)
def not_found(error):
    """Answer HTTP 404 with a JSON body instead of the default HTML page."""
    body = {
        "error": "Not found",
        "message": "The requested endpoint does not exist",
    }
    return jsonify(body), 404


@app.errorhandler(500)
def internal_error(error):
    """Log the failure and answer HTTP 500 with a generic JSON body."""
    logger.error(f"Internal server error: {error}", exc_info=True)
    body = {
        "error": "Internal server error",
        "message": "An unexpected error occurred",
    }
    return jsonify(body), 500


# =====================
# Application Entry Point
# =====================

if __name__ == '__main__':
    # Development server: listens on all interfaces, port 5100, debug off.
    run_options = {
        "host": '0.0.0.0',
        "port": 5100,
        "debug": False,
    }
    app.run(**run_options)
