# vision-career/vacancies/main/vector_store.py

from qdrant_client import models
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from qdrant_client import QdrantClient
from qdrant_client.models import Filter
from vacancies.main.models import VacancyFeatures

# Qdrant client in local mode: data is persisted under ./embeddings relative
# to the process working directory.
client = QdrantClient(path="./embeddings")
# Alternative: connect to a standalone Qdrant server instead of local storage.
#client = QdrantClient(url="http://localhost:6333")

# Vacancy feature keys. Each one gets its own named vector in the collection
# and is looked up by this exact key in the feature dicts handled below.
FEATURE_NAMES = [
    "employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
    "location", "salary_range", "languages", "education", "schedule", "additional_requirements"
]

# One cosine-distance vector of size 3072 per feature name.
# NOTE(review): 3072 presumably matches the output dimension of the embedding
# model configured below - confirm before changing either one.
vectors_config = {
    name: models.VectorParams(size=3072, distance=models.Distance.COSINE) for name in FEATURE_NAMES
}

# Create the collection at import time if it does not exist yet. An existing
# collection is left as-is; its vector configuration is not re-checked here.
collection_name = "vacancies"
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=vectors_config
    )

# Shared embedding model used both when indexing vacancies and when querying.
embedding = OpenAIEmbeddings(model="text-embedding-3-large")

def _prepare_texts(features):
    """Build the text to embed for every name in ``FEATURE_NAMES``.

    Args:
        features: Mapping of feature name to its raw value (a scalar, a list
            of values, or ``None``). Names missing from the mapping are
            treated like ``None``.

    Returns:
        A dict with one entry per name in ``FEATURE_NAMES``, in that order.
        List values are joined with single spaces, other truthy values are
        converted with ``str``, and missing or falsy values become ``""``.
    """
    texts = {}
    for name in FEATURE_NAMES:
        value = features.get(name)
        if isinstance(value, list):
            # Stringify each item so a list holding numbers does not make
            # ``str.join`` raise TypeError; ``None`` entries carry no
            # information and are dropped. All-string lists are unaffected.
            text = " ".join(str(item) for item in value if item is not None)
        elif value:
            text = str(value)
        else:
            text = ""
        texts[name] = text
    return texts


def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict):
    """Embed a vacancy's features and upsert them as one point.

    Args:
        vacancy_id: Used as the point id in the collection.
        features: Mapping of feature name to raw value; converted to text by
            ``_prepare_texts``.
        payload: Stored on the point unchanged.

    Every feature name gets a vector: features with text are embedded, and
    features without text get an all-zero placeholder of size 3072.
    """
    named_vectors = {
        feature: embedding.embed_query(feature_text) if feature_text else [0.0] * 3072
        for feature, feature_text in _prepare_texts(features).items()
    }
    point = models.PointStruct(
        id=vacancy_id,
        vector=named_vectors,
        payload=payload,
    )
    client.upsert(collection_name=collection_name, points=[point])


def search_similarities(
    query_features: dict, query_filter: Filter, limit: int = 1000
) -> "int | str | None":
    """Return the id of the stored vacancy that best matches the query features.

    Each query feature that has text is embedded and searched against the
    named vector of the same feature. A vacancy's total score is the sum of
    the scores it received across those per-feature searches; a vacancy that
    does not appear in the top ``limit`` hits for a feature gets no
    contribution from that feature.

    Args:
        query_features: Mapping of feature name to raw value; converted to
            text by ``_prepare_texts``.
        query_filter: Qdrant filter applied to every per-feature search.
        limit: Maximum number of hits requested per feature search.

    Returns:
        The point id with the highest total score, or ``None`` when no
        feature has text or no stored point matches. On ties the id that was
        seen first wins.
    """
    texts = _prepare_texts(query_features)

    # Running sum of per-feature scores, keyed by point id.
    totals = {}
    for name, text in texts.items():
        if not text:
            # Feature absent from the query: nothing to embed or search.
            continue
        vec = embedding.embed_query(text)
        # NOTE(review): newer qdrant-client releases deprecate ``search`` in
        # favour of ``query_points``; migrate once the pinned version allows.
        results = client.search(
            collection_name=collection_name,
            query_vector=(name, vec),
            limit=limit,
            # Only ids and scores are used, so skip fetching payloads.
            with_payload=False,
            query_filter=query_filter,
        )
        for res in results:
            totals[res.id] = totals.get(res.id, 0.0) + res.score

    if not totals:
        return None
    return max(totals, key=totals.get)


def extract_vacancy_features(content: str) -> VacancyFeatures:
    """Extract structured vacancy features from a free-text job description.

    Args:
        content: Raw vacancy text, inserted verbatim at the end of the prompt.

    Returns:
        The result of the structured-output call, which is configured with
        the ``VacancyFeatures`` schema.
    """
    instructions = f"""
    Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
    Features:
    - employment_type: Тип занятости (e.g., Полная занятость, Частичная)
    - work_format: Формат работы (e.g., Офис, Удалённо, Гибрид)
    - experience: Опыт работы (e.g., 3-5 лет, Нет опыта)
    - position_level: Уровень позиции (e.g., Junior, Senior)
    - industry: Отрасль / Сфера деятельности (e.g., IT, Финансы)
    - tech_stack: Технологический стек / Ключевые навыки (list of strings)
    - location: География (e.g., Москва, Россия)
    - salary_range: Зарплатные ожидания / вилка (e.g., 100000-200000 руб)
    - languages: Языки (list of strings, e.g., ["Русский", "Английский"])
    - education: Образование (e.g., Высшее, Среднее специальное)
    - schedule: График работы (e.g., Полный день, Сменный)
    - additional_requirements: Дополнительные предпочтения / требования (list of strings)
    Vacancy content:
    {content}
    """
    # A new chat model is built on every call and bound to the output schema.
    chat_model = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal")
    return chat_model.with_structured_output(VacancyFeatures).invoke(instructions)