import asyncio
import json
import os
import logging
from typing import AsyncGenerator, Dict, Any, Optional, List
import base64

from google.cloud import speech
from google.cloud import texttospeech
from google.api_core import exceptions
import openai

from config import config
from models import Personality, SCENARIO_PERSONALITIES, GoalItem, Gender

logger = logging.getLogger(__name__)


# Prompt used to judge "order"/"buy" goals. Filled in with str.format(), so
# braces inside the substituted values are never interpreted.
_ORDER_JUDGE_PROMPT = """You are a strict judge determining if a specific goal was FULLY completed in a conversation.

GOAL TO CHECK: {description}

RECENT CONVERSATION CONTEXT:
{conversation_context}

LATEST EXCHANGE:
User: {user_message}
AI: {ai_response}

CRITICAL RULES FOR ORDERING GOALS:
1. ONLY return "YES" if the user has COMPLETELY finished this exact goal
2. Return "NO" if the goal is partial, incomplete, or just being discussed
3. For "Order [item]" goals: user must explicitly say they want/order that EXACT item with ALL specifications
4. For drink goals: user must specifically mention wanting/ordering a drink
5. Don't mark as complete just because the AI is asking about it

Answer ONLY "YES" or "NO":"""

# Prompt used to judge every other (small talk) goal.
_SMALL_TALK_JUDGE_PROMPT = """You are judging if a conversational goal was completed in a natural small talk scenario.

GOAL TO CHECK: {description}

RECENT CONVERSATION CONTEXT:
{conversation_context}

LATEST EXCHANGE:
User: {user_message}
AI: {ai_response}

RULES FOR SMALL TALK GOALS:
1. Return "YES" if the user has naturally accomplished this conversational goal ANYWHERE in the conversation
2. For "Share something about yourself" goals: Look through the ENTIRE conversation for work, family, hobbies, personal interests, financial situation, dreams, etc.
3. For "Ask follow-up questions" goals: user asks questions to continue conversation
4. For "Exchange greetings" goals: user greets or responds to greetings
5. For "Discuss weather/daily life" goals: user talks about weather, daily activities, current events
6. Goals can be completed through natural conversation flow, not just direct statements
7. IMPORTANT: Check the FULL conversation context, not just the latest exchange

EXAMPLES:
- Goal: "Share something about yourself (work, family, hobbies, etc.)"
  - User mentions work: "sibuk banget di kantor sering lembur" → YES (work situation)
  - User mentions finances: "nggak punya duit" → YES (personal finance)
  - User mentions hobbies: "sukanya ke Afrika" → YES (travel interests)
  - User mentions dreams: "Belum pernah mimpi aja dulu sih" → YES (personal aspirations)
  - User just greets: "Baik nih" → NO (just greeting, no personal info)
- Goal: "Ask follow-up questions to keep the conversation going"
  - User: "Mas Ali suka lari juga gak?" → YES (asking follow-up question)
  - User: "Gimana kabar keluarga?" → YES (asking about family)
  - User: "Iya" → NO (just responding, not asking)

Be reasonable and natural - small talk goals should be completed through normal conversation. SCAN THE ENTIRE CONVERSATION, not just the latest message.

Answer ONLY "YES" or "NO":"""


async def _run_blocking(func, *args, **kwargs):
    """Run a blocking callable in the default thread pool and await its result.

    The Google TTS client and the synchronous OpenAI client used in this
    module block while waiting on the network; calling them directly from a
    coroutine would stall the event loop for every other task.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, lambda: func(*args, **kwargs))


class SpeechToTextService:
    """Streaming speech recognition backed by Google Cloud Speech-to-Text."""

    def __init__(self):
        # Synchronous client kept for backward compatibility with any code
        # that reaches into ``.client``; streaming uses the async client below.
        self.client = speech.SpeechClient()
        # Created lazily inside the running event loop (see transcribe_streaming).
        self._async_client = None

        # Map the configured encoding name onto the API enum.
        audio_encoding = speech.RecognitionConfig.AudioEncoding
        encoding_map = {
            "WEBM_OPUS": audio_encoding.WEBM_OPUS,
            "LINEAR16": audio_encoding.LINEAR16,
            "FLAC": audio_encoding.FLAC,
            "MULAW": audio_encoding.MULAW,
            "AMR": audio_encoding.AMR,
            "AMR_WB": audio_encoding.AMR_WB,
            "OGG_OPUS": audio_encoding.OGG_OPUS,
            "MP3": audio_encoding.MP3,
        }

        self.recognition_config = speech.RecognitionConfig(
            # Unknown encoding names fall back to WEBM_OPUS.
            encoding=encoding_map.get(config.SPEECH_ENCODING, audio_encoding.WEBM_OPUS),
            sample_rate_hertz=config.SPEECH_SAMPLE_RATE,
            language_code=config.SPEECH_LANGUAGE_CODE,
            enable_automatic_punctuation=True,
            use_enhanced=True,
            model="latest_long",
        )

        self.streaming_config = speech.StreamingRecognitionConfig(
            config=self.recognition_config,
            interim_results=True,
            single_utterance=False,
        )

    async def transcribe_streaming(
        self, audio_generator: AsyncGenerator[bytes, None]
    ) -> AsyncGenerator[Dict[str, Any], None]:
        """Stream audio to Google Cloud Speech-to-Text and yield transcription results.

        Args:
            audio_generator: Async generator producing raw audio chunks.

        Yields:
            ``{"type": "transcription", "transcript", "is_final", "confidence"}``
            for every recognition result (confidence is 0.0 for interim
            results), or a single ``{"type": "error", "message"}`` dict if the
            API call fails.
        """

        async def request_generator():
            # The first request carries only the configuration ...
            yield speech.StreamingRecognizeRequest(streaming_config=self.streaming_config)
            # ... every following request carries audio.
            async for chunk in audio_generator:
                yield speech.StreamingRecognizeRequest(audio_content=chunk)

        try:
            # The synchronous client cannot consume an async generator, so the
            # stream goes through the async client, created on first use so it
            # is bound to the running event loop.
            if self._async_client is None:
                self._async_client = speech.SpeechAsyncClient()

            responses = await self._async_client.streaming_recognize(
                requests=request_generator()
            )

            async for response in responses:
                for result in response.results:
                    if not result.alternatives:
                        # Nothing recognised for this result; skip instead of
                        # raising IndexError.
                        continue
                    best = result.alternatives[0]
                    is_final = result.is_final
                    yield {
                        "type": "transcription",
                        "transcript": best.transcript,
                        "is_final": is_final,
                        "confidence": best.confidence if is_final else 0.0,
                    }

        except exceptions.GoogleAPICallError as e:
            logger.error(f"Speech recognition error: {str(e)}")
            yield {
                "type": "error",
                "message": f"Speech recognition error: {str(e)}",
            }


class TextToSpeechService:
    """Text-to-speech synthesis backed by Google Cloud Text-to-Speech."""

    # Voice settings for named characters. ``pitch`` is None when no pitch
    # should be sent to the API.
    _CHARACTER_VOICES = {
        "Pak Budi": {
            "name": "id-ID-Chirp3-HD-Charon",
            "speaking_rate": 0.95,
            "pitch": None,
            "ssml_gender": texttospeech.SsmlVoiceGender.MALE,
        },
        "Ibu Sari": {
            "name": "id-ID-Chirp3-HD-Kore",
            "speaking_rate": 1.0,
            "pitch": None,
            "ssml_gender": texttospeech.SsmlVoiceGender.FEMALE,
        },
        "Mbak Sari": {
            "name": "id-ID-Chirp3-HD-Zephyr",
            "speaking_rate": 1.1,
            "pitch": None,
            "ssml_gender": texttospeech.SsmlVoiceGender.FEMALE,
        },
        "Adik Kasir": {
            "name": "id-ID-Chirp3-HD-Aoede",
            "speaking_rate": 1.05,
            "pitch": None,
            "ssml_gender": texttospeech.SsmlVoiceGender.FEMALE,
        },
        "Tetangga Ali": {
            "name": "id-ID-Chirp3-HD-Puck",
            "speaking_rate": 1.05,
            "pitch": None,
            "ssml_gender": texttospeech.SsmlVoiceGender.MALE,
        },
    }

    # Voices used when the character has no dedicated entry above.
    _GENDER_FALLBACK_VOICES = {
        texttospeech.SsmlVoiceGender.MALE: {
            "name": "id-ID-Chirp3-HD-Fenrir",
            "speaking_rate": 1.0,
            "pitch": None,
            "ssml_gender": texttospeech.SsmlVoiceGender.MALE,
        },
        texttospeech.SsmlVoiceGender.FEMALE: {
            "name": "id-ID-Chirp3-HD-Leda",
            "speaking_rate": 1.0,
            "pitch": None,
            "ssml_gender": texttospeech.SsmlVoiceGender.FEMALE,
        },
    }

    def __init__(self):
        self.client = texttospeech.TextToSpeechClient()

        # Gender mapping for Google TTS
        self.gender_map = {
            "FEMALE": texttospeech.SsmlVoiceGender.FEMALE,
            "MALE": texttospeech.SsmlVoiceGender.MALE,
            "NEUTRAL": texttospeech.SsmlVoiceGender.NEUTRAL,
            "male": texttospeech.SsmlVoiceGender.MALE,
            "female": texttospeech.SsmlVoiceGender.FEMALE,
        }

    def _get_voice_and_audio_config(self, gender: str, character_name: Optional[str] = None) -> tuple:
        """Return ``(VoiceSelectionParams, AudioConfig)`` for a character or gender.

        A character-specific voice wins when ``character_name`` is known;
        otherwise the voice is chosen by gender. Unknown genders, and NEUTRAL
        (which has no fallback voice), use the female fallback.
        """
        tts_gender = self.gender_map.get(gender)
        if tts_gender is None and isinstance(gender, str):
            # Accept any casing ("Male", " female ", ...), not only the exact
            # spellings listed in gender_map.
            tts_gender = self.gender_map.get(gender.strip().upper())
        if tts_gender is None:
            tts_gender = texttospeech.SsmlVoiceGender.FEMALE

        config_set = None
        if character_name and character_name in self._CHARACTER_VOICES:
            config_set = self._CHARACTER_VOICES[character_name]
            logger.info(f"Using character-specific voice for '{character_name}': {config_set['name']}")

        if config_set is None:
            config_set = self._GENDER_FALLBACK_VOICES.get(
                tts_gender,
                self._GENDER_FALLBACK_VOICES[texttospeech.SsmlVoiceGender.FEMALE],
            )
            logger.info(f"Using gender fallback voice for {tts_gender}: {config_set['name']}")

        voice = texttospeech.VoiceSelectionParams(
            language_code=config.TTS_LANGUAGE_CODE,
            name=config_set["name"],
            ssml_gender=config_set["ssml_gender"],
        )

        audio_config_params = {
            "audio_encoding": texttospeech.AudioEncoding.LINEAR16,
            "speaking_rate": config_set["speaking_rate"],
            "effects_profile_id": ['handset-class-device'],
        }
        # Only send a pitch when one is configured for the voice.
        if config_set["pitch"] is not None:
            audio_config_params["pitch"] = config_set["pitch"]

        audio_config = texttospeech.AudioConfig(**audio_config_params)
        return voice, audio_config

    async def synthesize_speech(
        self, text: str, gender: str = "female", character_name: Optional[str] = None
    ) -> bytes:
        """Convert text to speech using Google Cloud Text-to-Speech.

        Args:
            text: Text to speak.
            gender: Gender name used to pick a fallback voice.
            character_name: Optional character with a dedicated voice.

        Returns:
            The synthesized audio bytes (LINEAR16 encoding is requested).

        Raises:
            Exception: If the Google API call fails; the original
                ``GoogleAPICallError`` is chained as the cause.
        """
        try:
            logger.info(f"TTS synthesize_speech called with text: '{text}', gender: '{gender}', character: '{character_name}'")

            voice, audio_config = self._get_voice_and_audio_config(gender, character_name)
            logger.info(f"Using voice: {voice.name}, requested gender: '{gender}', mapped TTS gender: {voice.ssml_gender}")

            synthesis_input = texttospeech.SynthesisInput(text=text)

            # The client call blocks on the network; keep the event loop free.
            response = await _run_blocking(
                self.client.synthesize_speech,
                input=synthesis_input,
                voice=voice,
                audio_config=audio_config,
            )

            logger.info(f"TTS successful, audio length: {len(response.audio_content)} bytes")
            return response.audio_content

        except exceptions.GoogleAPICallError as e:
            logger.error(f"Text-to-speech error: {str(e)}")
            raise Exception(f"Text-to-speech error: {str(e)}") from e


class AIConversationService:
    """Role-play conversation driven by an OpenAI chat model, with goal tracking."""

    def __init__(self):
        self.client = openai.OpenAI(api_key=config.OPENAI_API_KEY)
        self.model = config.OPENAI_MODEL
        self.current_personality: Optional[Personality] = None
        # Each entry is {"user": <message>, "assistant": <reply>}.
        self.conversation_history: List[Dict[str, str]] = []
        self.goal_progress: List[GoalItem] = []

    @staticmethod
    def _fresh_goal_progress(personality: Personality) -> List[GoalItem]:
        """Return independent copies of the personality's goal items."""
        return [GoalItem(**item.dict()) for item in personality.goal_items]

    def set_personality(self, personality: Personality):
        """Set the current personality and reset history and goal progress."""
        self.current_personality = personality
        # Reset conversation history when personality changes
        self.conversation_history = []
        # Initialize goal progress
        self.goal_progress = self._fresh_goal_progress(personality)

    def reset_conversation(self):
        """Reset the conversation history and goal progress."""
        self.conversation_history = []
        if self.current_personality:
            self.goal_progress = self._fresh_goal_progress(self.current_personality)

    def get_personality_for_scenario(self, scenario: str, character_name: Optional[str] = None) -> Personality:
        """Get personality based on scenario and optional character name.

        Returns the named character when it exists in the scenario, otherwise
        the scenario's first personality, otherwise a generic default.
        """
        if scenario in SCENARIO_PERSONALITIES:
            personalities = SCENARIO_PERSONALITIES[scenario]
            if character_name and character_name in personalities:
                return personalities[character_name]
            # No (known) character requested: use the scenario's first one.
            # A scenario with no personalities falls through to the default
            # instead of raising IndexError.
            first = next(iter(personalities.values()), None)
            if first is not None:
                return first

        # Default personality when the scenario is unknown or empty.
        return Personality(
            character_type="generic",
            name="Pak/Bu",
            tone="friendly",
            age_range="middle-aged",
            background="Helpful Indonesian person",
            typical_phrases=["Halo!", "Apa kabar?", "Bisa saya bantu?"],
            response_style="Friendly and helpful",
            location_context="Indonesia",
            is_helpful=True,
            is_talkative=True,
        )

    async def check_goal_completion(self, user_message: str, ai_response: str) -> bool:
        """Judge every incomplete goal against the latest exchange.

        Marks newly completed goals on ``self.goal_progress``.

        Returns:
            True if at least one goal was completed by this call.
        """
        if not self.goal_progress:
            return False

        # Only check goals that aren't already completed
        incomplete_goals = [g for g in self.goal_progress if not g.completed]
        if not incomplete_goals:
            return False

        logger.info(f"Checking goal completion for user message: '{user_message}'")
        logger.info(f"Incomplete goals: {[g.description for g in incomplete_goals]}")

        # get_response() stores the current exchange before calling this
        # method; drop it from the context so the judge prompt does not show
        # the same exchange twice (it is passed separately as the latest one).
        prior_exchanges = self.conversation_history
        if (
            prior_exchanges
            and prior_exchanges[-1].get("user") == user_message
            and prior_exchanges[-1].get("assistant") == ai_response
        ):
            prior_exchanges = prior_exchanges[:-1]

        conversation_context = "".join(
            f"User: {exchange['user']}\nAI: {exchange['assistant']}\n"
            for exchange in prior_exchanges[-3:]
        )

        goals_completed = False
        for goal in incomplete_goals:
            logger.info(f"Checking goal: '{goal.description}'")
            completion_check = await self._judge_goal_completion(
                goal, user_message, ai_response, conversation_context
            )
            if completion_check:
                goal.completed = True
                goals_completed = True
                logger.info(f"✅ Goal completed: {goal.description}")
            else:
                logger.info(f"❌ Goal not completed: {goal.description}")

        return goals_completed

    async def _judge_goal_completion(
        self, goal: GoalItem, user_message: str, ai_response: str, conversation_context: str
    ) -> bool:
        """Use the LLM as a judge for a single goal.

        Goals whose description mentions "order" or "buy" use the strict
        ordering prompt; all others use the small-talk prompt.

        Returns:
            True when the model's answer starts with "YES"; False otherwise,
            including when the API call fails (the error is logged).
        """
        try:
            description_lower = goal.description.lower()
            if "order" in description_lower or "buy" in description_lower:
                template = _ORDER_JUDGE_PROMPT
            else:
                template = _SMALL_TALK_JUDGE_PROMPT

            judge_prompt = template.format(
                description=goal.description,
                conversation_context=conversation_context,
                user_message=user_message,
                ai_response=ai_response,
            )

            # Blocking SDK call; run it off the event loop.
            response = await _run_blocking(
                self.client.chat.completions.create,
                model=self.model,
                messages=[{"role": "user", "content": judge_prompt}],
                max_tokens=5,
                temperature=0.1,  # Low temperature for consistent judging
            )

            result = (response.choices[0].message.content or "").strip().upper()
            logger.info(f"Goal judge result for '{goal.description}': {result}")

            # Accept "YES", "YES." and similar; an exact comparison would
            # reject answers with trailing punctuation.
            return result.startswith("YES")

        except Exception as e:
            # Best effort: a failed judge call simply leaves the goal open.
            logger.error(f"Error in goal completion judge: {str(e)}")
            return False

    def are_all_goals_completed(self) -> bool:
        """Return True only when there is at least one goal and all are completed.

        ``all()`` of an empty list is True, which would report a personality
        without goals as finished before the conversation even started.
        """
        return bool(self.goal_progress) and all(goal.completed for goal in self.goal_progress)

    def get_goal_status(self) -> Dict[str, Any]:
        """Return the scenario goal, each goal item's state and the overall flag."""
        return {
            "scenario_goal": self.current_personality.scenario_goal if self.current_personality else "",
            "goal_items": [
                {
                    "id": goal.id,
                    "description": goal.description,
                    "completed": goal.completed,
                }
                for goal in self.goal_progress
            ],
            "all_completed": self.are_all_goals_completed(),
        }

    async def get_response(self, user_message: str, context: str = "") -> str:
        """Get the AI reply for ``user_message`` using the current personality.

        Stores the exchange in the conversation history and updates goal
        progress. If no personality is set, the default "warung" personality
        is used.

        Returns:
            The model's reply, or an apology string containing the error text
            if anything fails (this method never raises).
        """
        try:
            # Use current personality or default
            if not self.current_personality:
                default_personality = self.get_personality_for_scenario("warung", "pak_budi")
                self.set_personality(default_personality)

            system_prompt = self.current_personality.get_system_prompt(context)

            # Build messages with conversation history
            messages = [{"role": "system", "content": system_prompt}]

            # Keep the last 15 exchanges for better chitchat context.
            for exchange in self.conversation_history[-15:]:
                messages.append({"role": "user", "content": exchange["user"]})
                messages.append({"role": "assistant", "content": exchange["assistant"]})

            # Add current user message
            messages.append({"role": "user", "content": user_message})

            logger.info(f"Sending {len(messages)} messages to AI:")
            for i, msg in enumerate(messages):
                if msg["role"] == "system":
                    logger.info(f"  {i}: SYSTEM (length: {len(msg['content'])})")
                else:
                    logger.info(f"  {i}: {msg['role'].upper()}: '{msg['content']}'")

            # Blocking SDK call; run it off the event loop.
            response = await _run_blocking(
                self.client.chat.completions.create,
                model=self.model,
                messages=messages,
                max_tokens=250,
                temperature=0.7,
            )

            # The API may return no content; store an empty string rather
            # than None so later requests built from the history stay valid.
            ai_response = response.choices[0].message.content or ""

            self.conversation_history.append({
                "user": user_message,
                "assistant": ai_response,
            })

            await self.check_goal_completion(user_message, ai_response)

            logger.info(f"Conversation history length: {len(self.conversation_history)}")
            if self.conversation_history:
                last_exchange = self.conversation_history[-1]
                logger.info(f"Last exchange - User: '{last_exchange['user']}', AI: '{last_exchange['assistant']}'")

            if self.goal_progress:
                completed_goals = [g.description for g in self.goal_progress if g.completed]
                logger.info(f"Completed goals: {completed_goals}")
                logger.info(f"All goals completed: {self.are_all_goals_completed()}")

            return ai_response

        except Exception as e:
            # Deliberate best effort: callers get a speakable message instead
            # of an exception, but the failure is now recorded.
            logger.exception("Failed to get AI response")
            return f"Maaf, ada error: {str(e)}"


class ConversationFlowService:
    """Ties transcription, AI response and speech synthesis together."""

    # (scenario, keywords) pairs checked in order; the first match wins.
    # "coffee" also covers contexts containing "coffee_shop".
    _SCENARIO_KEYWORDS = (
        ("coffee_shop", ("coffee",)),
        ("warung", ("warung", "nasi goreng")),
        ("ojek", ("ojek", "mall")),
        ("alfamart", ("alfamart", "indomie")),
    )

    # LINEAR16 output from Google TTS is WAV data (PCM with a WAV header),
    # so it must not be labelled "mp3".
    _AUDIO_FORMAT = "wav"

    def __init__(self):
        self.stt_service = SpeechToTextService()
        self.tts_service = TextToSpeechService()
        self.ai_service = AIConversationService()

    def set_scenario_personality(self, scenario: str, character_name: Optional[str] = None):
        """Set the personality based on scenario and character.

        The personality (and with it history and goals) is only replaced when
        the resolved personality's name differs from the current one.
        """
        personality = self.ai_service.get_personality_for_scenario(scenario, character_name)
        current = self.ai_service.current_personality

        if not current or current.name != personality.name:
            logger.info(f"Setting new personality: {personality.name}")
            self.ai_service.set_personality(personality)
            logger.info("Goal progress initialized for new personality")
        else:
            logger.info(f"Keeping existing personality: {personality.name}")

    def _match_scenario(self, context: Optional[str]) -> Optional[str]:
        """Return the scenario whose keyword occurs in ``context``, or None."""
        context_lower = (context or "").lower()
        for scenario, keywords in self._SCENARIO_KEYWORDS:
            if any(keyword in context_lower for keyword in keywords):
                return scenario
        return None

    def _current_personality_in(self, scenario: str) -> bool:
        """Return True if the active personality is one of ``scenario``'s characters."""
        current = self.ai_service.current_personality
        if current is None or scenario not in SCENARIO_PERSONALITIES:
            return False
        return any(
            personality.name == current.name
            for personality in SCENARIO_PERSONALITIES[scenario].values()
        )

    async def process_conversation_flow(self, transcribed_text: str, scenario_context: str = "") -> Dict[str, Any]:
        """Process the complete conversation flow: Text → AI → Speech.

        Returns:
            An ``"ai_response"`` dict with the reply text, base64 audio, audio
            format, character name and goal status, or an ``"error"`` dict if
            any step fails (this method never raises).
        """
        try:
            # Switch personality only when the context really names a scenario
            # and the active personality is not already part of it. Falling
            # back to "warung" here, or forcing the scenario's first
            # character, would discard an explicitly selected personality
            # together with its history and goal progress.
            scenario = self._match_scenario(scenario_context)
            logger.info(f"Scenario matched from context '{scenario_context}': {scenario!r}")
            if scenario and not self._current_personality_in(scenario):
                self.set_scenario_personality(scenario)

            ai_response = await self.ai_service.get_response(transcribed_text, scenario_context)

            personality = self.ai_service.current_personality
            gender = personality.gender.value if personality else "female"
            personality_name = personality.name if personality else "Unknown"

            logger.info(f"Generating TTS for character '{personality_name}' with text: '{ai_response}' and gender: '{gender}'")
            audio_content = await self.tts_service.synthesize_speech(ai_response, gender, personality_name)
            logger.info(f"TTS generation successful, audio length: {len(audio_content)} bytes")

            audio_base64 = base64.b64encode(audio_content).decode('utf-8')

            goal_status = self.ai_service.get_goal_status()

            return {
                "type": "ai_response",
                "text": ai_response,
                "audio": audio_base64,
                "audio_format": self._AUDIO_FORMAT,
                "character": personality_name,
                "goal_status": goal_status,
                "conversation_complete": goal_status.get("all_completed", False),
            }

        except Exception as e:
            logger.exception("Conversation flow failed")
            return {
                "type": "error",
                "message": f"Conversation flow error: {str(e)}",
            }

    def extract_scenario_from_context(self, context: str) -> str:
        """Extract scenario type from context string.

        Returns the first scenario whose keyword appears in ``context``
        (case-insensitive), defaulting to "warung" when nothing matches.
        """
        logger.info(f"Extracting scenario from context: '{context}'")

        detected_scenario = self._match_scenario(context) or "warung"  # Default to warung

        logger.info(f"Detected scenario: '{detected_scenario}'")
        return detected_scenario