"""Main FastAPI application for Gentle Momentum Reader."""
|
|
|
|
import asyncio
|
|
import json
|
|
import re
|
|
from typing import Dict, List
|
|
from urllib.parse import urlparse
|
|
|
|
import nltk
|
|
import requests
|
|
import textstat
|
|
from bs4 import BeautifulSoup
|
|
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from fastapi.staticfiles import StaticFiles
|
|
from pydantic import BaseModel
|
|
from readability import Document
|
|
|
|
# Make sure the NLTK sentence-tokenizer data is available before first use.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    nltk.download("punkt_tab")

app = FastAPI(title="Gentle Momentum Reader")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

class TextRequest(BaseModel):
    """Request model for text analysis."""

    text: str
    base_speed: float = 200.0  # words per minute


class URLRequest(BaseModel):
    """Request model for URL article extraction."""

    url: str

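# How pacing works (summary of the logic below): every word and sentence
# contributes a small fractional speed adjustment (e.g. -0.05 = 5% faster,
# +0.10 = 10% slower). Per-sentence adjustments are averaged over the whole
# text and applied once as final_wpm = base_wpm * (1 + average_adjustment),
# alongside a textstat-based difficulty suggestion.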
class PaceCalculator:
    """Calculate optimal reading pace based on text analysis."""

    def __init__(self, base_wpm: float = 200.0):
        self.base_wpm = float(base_wpm)  # Ensure it's always a float
        self.complex_words = set()

    def analyze_word_complexity(self, word: str) -> float:
        """Analyze individual word complexity."""
        word_clean = re.sub(r'[^\w]', '', word.lower())

        if len(word_clean) <= 3:
            return -0.05  # Simple words 5% faster
        elif len(word_clean) >= 8:
            return 0.10  # Complex words 10% slower
        elif word_clean in ["the", "and", "for", "are", "but", "not", "you", "all", "can", "had", "her", "was", "one", "our", "out", "day", "get", "has", "him", "his", "how", "its", "new", "now", "old", "see", "two", "way", "who", "boy", "did", "man", "run", "say", "she", "too", "use"]:
            return -0.05  # Common words faster

        # Check for technical terms (contains digits, or capital letters mid-word)
        if re.search(r'\d', word) or re.search(r'[A-Z]', word[1:]):
            return 0.10

        return 0.0

    def analyze_sentence_complexity(self, sentence: str) -> float:
        """Analyze sentence complexity."""
        words = sentence.split()
        word_count = len(words)

        if word_count > 20:
            return 0.05  # Long sentences 5% slower
        elif word_count < 8:
            return -0.05  # Short sentences 5% faster

        return 0.0

    def analyze_content_type(self, text: str) -> float:
        """Analyze content type for base adjustment."""
        # Simple keyword heuristics for content type
        technical_keywords = ["algorithm", "function", "variable", "parameter", "implementation", "configuration"]
        academic_keywords = ["research", "study", "analysis", "methodology", "hypothesis", "conclusion"]

        text_lower = text.lower()

        if any(keyword in text_lower for keyword in technical_keywords):
            return 0.15  # Technical content 15% slower
        elif any(keyword in text_lower for keyword in academic_keywords):
            return 0.10  # Academic content 10% slower

        return 0.0

    def get_suggested_wpm(self, text: str) -> Dict[str, float]:
        """Suggest optimal WPM based on text difficulty using textstat."""
        # Get various readability scores
        flesch_reading_ease = textstat.flesch_reading_ease(text)
        flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
        gunning_fog = textstat.gunning_fog(text)

        # Base WPM suggestions by reading level; an average adult reads
        # roughly 200-250 WPM for normal text.
        if flesch_reading_ease >= 90:  # Very Easy (5th grade)
            suggested_wpm = 280
            difficulty = "Very Easy"
        elif flesch_reading_ease >= 80:  # Easy (6th grade)
            suggested_wpm = 250
            difficulty = "Easy"
        elif flesch_reading_ease >= 70:  # Fairly Easy (7th grade)
            suggested_wpm = 220
            difficulty = "Fairly Easy"
        elif flesch_reading_ease >= 60:  # Standard (8th-9th grade)
            suggested_wpm = 200
            difficulty = "Standard"
        elif flesch_reading_ease >= 50:  # Fairly Difficult (10th-12th grade)
            suggested_wpm = 180
            difficulty = "Fairly Difficult"
        elif flesch_reading_ease >= 30:  # Difficult (college level)
            suggested_wpm = 160
            difficulty = "Difficult"
        else:  # Very Difficult (graduate level)
            suggested_wpm = 140
            difficulty = "Very Difficult"

        # Adjust based on Gunning Fog (sentence complexity)
        if gunning_fog > 16:  # Graduate level
            suggested_wpm *= 0.85
        elif gunning_fog > 13:  # College level
            suggested_wpm *= 0.9
        elif gunning_fog < 8:  # Easy reading
            suggested_wpm *= 1.1

        return {
            "suggested_wpm": round(suggested_wpm),
            "difficulty_level": difficulty,
            "flesch_reading_ease": round(flesch_reading_ease, 1),
            "flesch_kincaid_grade": round(flesch_kincaid_grade, 1),
            "gunning_fog": round(gunning_fog, 1),
            "reading_time_minutes": round(len(text.split()) / suggested_wpm, 1)
        }

    def calculate_reading_speed(self, text: str) -> Dict[str, float]:
        """Calculate reading speed adjustments for text."""
        sentences = nltk.sent_tokenize(text)

        total_adjustment = 0.0
        sentence_adjustments = []

        content_type_adjustment = self.analyze_content_type(text)

        for sentence in sentences:
            sentence_adj = self.analyze_sentence_complexity(sentence)
            words = sentence.split()

            word_adjustments = []
            for word in words:
                word_adj = self.analyze_word_complexity(word)
                word_adjustments.append(word_adj)

            avg_word_adj = sum(word_adjustments) / len(word_adjustments) if word_adjustments else 0.0
            final_sentence_adj = sentence_adj + avg_word_adj + content_type_adjustment

            sentence_adjustments.append({
                "text": sentence,
                "adjustment": final_sentence_adj,
                "word_count": len(words)
            })

            total_adjustment += final_sentence_adj

        avg_adjustment = total_adjustment / len(sentences) if sentences else 0.0
        final_wpm = self.base_wpm * (1 + avg_adjustment)

        # Get text difficulty suggestion
        suggestion = self.get_suggested_wpm(text)

        return {
            "base_wpm": self.base_wpm,
            "adjustment_factor": avg_adjustment,
            "final_wpm": final_wpm,
            "sentences": sentence_adjustments,
            "text_difficulty": suggestion
        }

class ReadingSession:
    """Manage individual reading session state."""

    def __init__(self, websocket: WebSocket, text: str, analysis: Dict):
        self.websocket = websocket
        self.words = text.split()
        self.analysis = analysis
        self.current_word_index = -1
        self.is_paused = False
        self.is_stopped = False
        self.words_per_minute = analysis["final_wpm"]
        self.seconds_per_word = 60.0 / self.words_per_minute
        self.session_task = None

class ConnectionManager:
    """Manage WebSocket connections and reading sessions."""

    def __init__(self):
        self.active_connections: List[WebSocket] = []
        self.sessions: Dict[WebSocket, ReadingSession] = {}

    async def connect(self, websocket: WebSocket):
        await websocket.accept()
        self.active_connections.append(websocket)

    def disconnect(self, websocket: WebSocket):
        if websocket in self.active_connections:
            self.active_connections.remove(websocket)
        if websocket in self.sessions:
            session = self.sessions[websocket]
            session.is_stopped = True
            if session.session_task:
                session.session_task.cancel()
            del self.sessions[websocket]


manager = ConnectionManager()

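# Article extraction strategy (as implemented below): fetch the page, try
# readability-lxml first, fall back to BeautifulSoup with common article
# selectors, and finally fall back to collecting all <p> tags on the page.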
def extract_article_from_url(url: str) -> Dict[str, str]:
    """Extract article content from URL using readability-lxml."""
    # Add a protocol if missing, then validate; validating first would reject
    # scheme-less URLs like "example.com/article" before they could be fixed.
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    # Validate URL
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:
        raise ValueError("Invalid URL format")

    try:
        # Fetch the page
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Method 1: Try readability-lxml first (most reliable for articles)
        doc = Document(response.text)
        title = doc.title()
        content_html = doc.summary()

        if content_html:
            # Parse the extracted HTML to get clean text
            soup = BeautifulSoup(content_html, 'html.parser')

            # Get all paragraph text
            paragraphs = soup.find_all(['p', 'div', 'span'])
            text_parts = []

            for p in paragraphs:
                p_text = p.get_text().strip()
                if len(p_text) > 20:  # Filter out short snippets
                    text_parts.append(p_text)

            text = '\n\n'.join(text_parts)

            if text and len(text.strip()) > 100:
                return {
                    "title": title or "Article",
                    "text": clean_article_text(text),
                    "url": url,
                    "method": "readability"
                }

        # Method 2: Fallback to manual BeautifulSoup extraction
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove unwanted elements
        for element in soup.find_all(['script', 'style', 'nav', 'header', 'footer', 'aside', 'advertisement']):
            element.decompose()

        # Try common article selectors
        article_selectors = [
            'article',
            '[role="main"]',
            '.article-content',
            '.post-content',
            '.entry-content',
            '.content',
            '#content',
            '.story-body',
            '.article-body'
        ]

        title = ""
        text = ""

        # Extract title
        title_selectors = ['h1', 'title', '.headline', '.article-title']
        for selector in title_selectors:
            title_elem = soup.select_one(selector)
            if title_elem:
                title = title_elem.get_text().strip()
                break

        # Extract article text
        for selector in article_selectors:
            article_elem = soup.select_one(selector)
            if article_elem:
                # Get all paragraph text
                paragraphs = article_elem.find_all(['p', 'div'], recursive=True)
                text_parts = []

                for p in paragraphs:
                    p_text = p.get_text().strip()
                    if len(p_text) > 20:  # Filter out short snippets
                        text_parts.append(p_text)

                text = '\n\n'.join(text_parts)
                break

        # Fallback: get all paragraph text from the page
        if not text or len(text.strip()) < 100:
            paragraphs = soup.find_all('p')
            text_parts = []

            for p in paragraphs:
                p_text = p.get_text().strip()
                if len(p_text) > 30:  # Longer threshold for fallback
                    text_parts.append(p_text)

            text = '\n\n'.join(text_parts[:20])  # Limit to first 20 paragraphs

        if text and len(text.strip()) > 100:
            return {
                "title": title or "Article",
                "text": clean_article_text(text),
                "url": url,
                "method": "beautifulsoup"
            }

    except Exception as e:
        raise ValueError(f"Failed to extract article: {str(e)}")

    raise ValueError("Could not extract readable content from this URL")

def clean_article_text(text: str) -> str:
    """Clean and format extracted article text."""
    # Collapse runs of spaces/tabs, but keep newlines so the line-based
    # cleanup below still has paragraph breaks to work with
    text = re.sub(r'[ \t]+', ' ', text)

    # Remove excessive line breaks
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)

    # Remove common web artifacts
    text = re.sub(r'(Advertisement|ADVERTISEMENT|Subscribe|Newsletter)', '', text, flags=re.IGNORECASE)

    # Fix sentence spacing
    text = re.sub(r'\.(\w)', r'. \1', text)

    # Remove lines that are too short (likely navigation/metadata)
    lines = text.split('\n')
    cleaned_lines = []

    for line in lines:
        line = line.strip()
        if len(line) > 20 and not re.match(r'^(By |Share |Tweet |Email)', line):
            cleaned_lines.append(line)

    return '\n\n'.join(cleaned_lines).strip()

@app.post("/analyze-text")
|
|
async def analyze_text(request: TextRequest):
|
|
"""Analyze text and return reading pace calculations."""
|
|
calculator = PaceCalculator(request.base_speed)
|
|
analysis = calculator.calculate_reading_speed(request.text)
|
|
|
|
return {
|
|
"analysis": analysis,
|
|
"word_count": len(request.text.split()),
|
|
"estimated_time_minutes": len(request.text.split()) / analysis["final_wpm"]
|
|
}
|
|
|
|
|
|
@app.post("/extract-article")
|
|
async def extract_article(request: URLRequest):
|
|
"""Extract article content from URL."""
|
|
try:
|
|
result = extract_article_from_url(request.url)
|
|
word_count = len(result["text"].split())
|
|
|
|
return {
|
|
"title": result["title"],
|
|
"text": result["text"],
|
|
"url": result["url"],
|
|
"word_count": word_count,
|
|
"extraction_method": result["method"],
|
|
"success": True
|
|
}
|
|
|
|
except ValueError as e:
|
|
raise HTTPException(status_code=400, detail=str(e))
|
|
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
|
|
|
|
|
@app.websocket("/ws/reading-session")
async def websocket_endpoint(websocket: WebSocket):
    """WebSocket endpoint for real-time reading sessions."""
    await manager.connect(websocket)

    try:
        while True:
            data = await websocket.receive_text()
            message = json.loads(data)

            if message["type"] == "start_reading":
                text = message["text"]
                base_speed = float(message.get("base_speed", 200.0))

                calculator = PaceCalculator(base_speed)
                analysis = calculator.calculate_reading_speed(text)

                # Create session
                session = ReadingSession(websocket, text, analysis)
                manager.sessions[websocket] = session

                # Send analysis back
                await websocket.send_text(json.dumps({
                    "type": "analysis_complete",
                    "analysis": analysis,
                    "total_words": len(session.words)
                }))

                # Start reading session as a background task
                session.session_task = asyncio.create_task(
                    start_reading_session(session)
                )

            elif message["type"] == "pause":
                if websocket in manager.sessions:
                    session = manager.sessions[websocket]
                    session.is_paused = True
                    await websocket.send_text(json.dumps({
                        "type": "paused"
                    }))

            elif message["type"] == "resume":
                if websocket in manager.sessions:
                    session = manager.sessions[websocket]
                    session.is_paused = False
                    await websocket.send_text(json.dumps({
                        "type": "resumed"
                    }))

            elif message["type"] == "stop":
                if websocket in manager.sessions:
                    session = manager.sessions[websocket]
                    session.is_stopped = True
                    if session.session_task:
                        session.session_task.cancel()
                    await websocket.send_text(json.dumps({
                        "type": "stopped"
                    }))
                break

    except WebSocketDisconnect:
        pass
    finally:
        # Release the session and connection whether the client disconnected
        # or sent "stop"; otherwise stale entries linger in the manager.
        manager.disconnect(websocket)

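# WebSocket message protocol (as handled above): the client sends
# {"type": "start_reading" | "pause" | "resume" | "stop", ...}; the server
# replies with "analysis_complete", "paused", "resumed", "stopped", and
# streams "word_reveal" messages followed by "session_complete".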
async def start_reading_session(session: ReadingSession):
    """Start the reading session with progressive text reveal."""
    try:
        for i, word in enumerate(session.words):
            if session.is_stopped:
                break

            # Wait while paused
            while session.is_paused and not session.is_stopped:
                await asyncio.sleep(0.1)

            if session.is_stopped:
                break

            session.current_word_index = i

            # Send word reveal - this reveals the text progressively
            await session.websocket.send_text(json.dumps({
                "type": "word_reveal",
                "word_index": i,
                "word": word,
                "revealed_words": session.words[:i + 1],  # All words up to current
                "progress": (i + 1) / len(session.words)
            }))

            # Wait for the calculated per-word interval
            await asyncio.sleep(session.seconds_per_word)

        # Session complete
        if not session.is_stopped:
            await session.websocket.send_text(json.dumps({
                "type": "session_complete"
            }))

    except (WebSocketDisconnect, asyncio.CancelledError):
        pass

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
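# Example usage (a sketch; assumes this module is saved as main.py and the
# server is running locally on port 8000):
#   uvicorn main:app --reload
#   curl -X POST http://localhost:8000/analyze-text \
#        -H "Content-Type: application/json" \
#        -d '{"text": "A short paragraph to pace.", "base_speed": 220}'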