137 lines
4.5 KiB
Python
137 lines
4.5 KiB
Python
import asyncio
import io
import json
import os
from typing import Optional, Dict, Any

import httpx
import pdfplumber
from dotenv import load_dotenv
|
|
|
|
# Load variables from a .env file (python-dotenv) before the environment is
# read below, so a key defined there is visible to os.environ.
load_dotenv()

# API key used as the default credential by OpenRouterClient; None when the
# variable is not set.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

# Base URL that every request path in this module is appended to.
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
|
|
|
|
|
|
class OpenRouterClient:
    """Small async client for the OpenRouter ``/chat/completions`` endpoint.

    Offers CV/resume parsing (PDF bytes -> structured JSON via an LLM) and a
    plain chat-completion call. A new ``httpx.AsyncClient`` is opened per
    method call; the instance itself only stores the API key and headers.
    """

    # Prompt for CV parsing. ``{text}`` is replaced with the extracted PDF
    # text via ``str.format``; literal braces of the JSON skeleton are
    # therefore doubled.
    _CV_PROMPT_TEMPLATE = (
        "Analyze this CV/Resume and extract the following information "
        "in JSON format.\n"
        "\n"
        "CV Text:\n"
        "{text}\n"
        "\n"
        "Please extract and return ONLY a JSON object with these fields "
        '(use "NONE" for missing information):\n'
        "{{\n"
        '"name": "Full name",\n'
        '"email": "Email address",\n'
        '"position": "Current or desired job title",\n'
        '"competencies": "Key competencies and areas of expertise",\n'
        '"experience": "Work experience summary",\n'
        '"skills": "Technical and soft skills (comma-separated)",\n'
        '"country": "Country",\n'
        '"languages": "Languages spoken (comma-separated)",\n'
        '"employment_format": '
        '"Preferred employment format (remote/office/hybrid)",\n'
        '"rate": "Salary expectations or rate",\n'
        '"relocation": "Relocation preferences"\n'
        "}}\n"
        "\n"
        "Important:\n"
        '- If you can\'t find an entity, use "NONE"\n'
        "- Return ONLY the JSON object, no additional text or markdown\n"
    )

    def __init__(self, api_key: Optional[str] = None):
        """Store the credential and build the request headers.

        Args:
            api_key: Explicit API key. When omitted (or empty), the
                module-level ``OPENROUTER_API_KEY`` is used instead.

        Raises:
            ValueError: If neither source provides a non-empty key.
        """
        self.api_key = api_key or OPENROUTER_API_KEY
        if not self.api_key:
            raise ValueError("OPENROUTER_API_KEY not set in environment")

        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "HTTP-Referer": "http://localhost:8000",
            "X-Title": "RAG AI Assistant",
        }

    @staticmethod
    def _extract_pdf_text(pdf_bytes: bytes) -> str:
        """Return the text of all pages joined by newlines ("" for empty pages)."""
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
            return "\n".join(page.extract_text() or "" for page in pdf.pages)

    @staticmethod
    def _parse_json_content(content: str) -> Dict[str, Any]:
        """Decode the model reply as JSON, tolerating markdown fences and prose.

        Raises:
            json.JSONDecodeError: If no decodable JSON can be found.
        """
        cleaned = content.strip()
        if cleaned.startswith("```json"):
            cleaned = cleaned[len("```json"):]
        if cleaned.startswith("```"):
            cleaned = cleaned[3:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        cleaned = cleaned.strip()

        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            # Models sometimes add text around the object despite the prompt;
            # fall back to the outermost {...} span before giving up.
            start = cleaned.find("{")
            end = cleaned.rfind("}")
            if start == -1 or end < start:
                raise
            return json.loads(cleaned[start:end + 1])

    async def parse_cv_from_pdf(
        self,
        pdf_bytes: bytes,
        model: str = "qwen/qwen-2.5-72b-instruct",
        max_retries: int = 3
    ) -> Dict[str, Any]:
        """Extract structured CV fields from a PDF using a chat model.

        The PDF text is extracted with pdfplumber, embedded in a prompt and
        sent to the chat-completions endpoint. HTTP 429 responses are retried
        with exponential backoff (5s, 10s, 20s, ...) until ``max_retries``
        requests have been made.

        Args:
            pdf_bytes: Raw bytes of the PDF document.
            model: Model identifier passed through to the API.
            max_retries: Maximum number of requests to attempt.

        Returns:
            The JSON value decoded from the model's reply.

        Raises:
            ValueError: If no text could be extracted from the PDF.
            httpx.HTTPStatusError: On a non-success response, including a
                429 on the final attempt.
            json.JSONDecodeError: If the reply does not contain valid JSON.
            Exception: If ``max_retries`` is less than 1, so no request ran.
        """
        text = self._extract_pdf_text(pdf_bytes)
        if not text.strip():
            raise ValueError("Could not extract text from PDF")

        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": self._CV_PROMPT_TEMPLATE.format(text=text),
                }
            ],
            "temperature": 0.1,
        }

        # One client (and connection pool) is shared by all attempts.
        async with httpx.AsyncClient(timeout=120.0) as client:
            for attempt in range(max_retries):
                response = await client.post(
                    f"{OPENROUTER_BASE_URL}/chat/completions",
                    headers=self.headers,
                    json=payload,
                )

                is_last_attempt = attempt >= max_retries - 1
                if response.status_code == 429 and not is_last_attempt:
                    await asyncio.sleep((2 ** attempt) * 5)
                    continue

                # Any remaining error status (including a 429 on the last
                # attempt) surfaces to the caller as httpx.HTTPStatusError.
                response.raise_for_status()

                result = response.json()
                content = result["choices"][0]["message"]["content"]
                return self._parse_json_content(content)

        # Reached only when the loop body never ran (max_retries <= 0).
        raise Exception("Rate limit exceeded after retries")

    async def chat_completion(
        self,
        messages: list,
        model: str = "qwen/qwen3-8b",
        temperature: float = 0.7
    ) -> str:
        """Send a chat request and return the first choice's message content.

        Args:
            messages: Chat messages, forwarded unchanged as the request's
                ``messages`` field.
            model: Model identifier passed through to the API.
            temperature: Sampling temperature passed through to the API.

        Returns:
            ``choices[0].message.content`` from the response body.

        Raises:
            httpx.HTTPStatusError: On a non-success response status.
        """
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.post(
                f"{OPENROUTER_BASE_URL}/chat/completions",
                headers=self.headers,
                json={
                    "model": model,
                    "messages": messages,
                    "temperature": temperature,
                },
            )
            response.raise_for_status()

            result = response.json()
            return result["choices"][0]["message"]["content"]
|
|
|
|
|
|
# Shared module-level client instance. It is constructed at import time, so
# importing this module raises ValueError when no API key is available
# (OpenRouterClient.__init__ rejects an empty key).
openrouter_client = OpenRouterClient()
|