"""Store per-feature vacancy embeddings in Qdrant and rank vacancies by them.

Every vacancy is stored as one Qdrant point that carries one named vector per
entry in ``FEATURE_NAMES``. Searching embeds the query features the same way,
runs one vector search per feature and sums the per-feature similarity scores.
"""

from typing import Optional, Union

from langchain_openai import OpenAIEmbeddings
from qdrant_client import QdrantClient, models
from qdrant_client.models import Filter

# Alternative kept from the original file (local path instead of a server URL):
# client = QdrantClient(path="./embeddings")
client = QdrantClient(url="http://localhost:6333")

FEATURE_NAMES = [
    "employment_type",
    "work_format",
    "experience",
    "position_level",
    "industry",
    "tech_stack",
    "location",
    "salary_range",
    "languages",
    "education",
    "schedule",
    "additional_requirements",
]

# Size of every named vector in the collection; the zero placeholders written
# by add_vacancy_vectors use the same length.
VECTOR_SIZE = 3072

# Default number of hits requested from Qdrant for each per-feature search.
SEARCH_LIMIT = 1000

vectors_config = {
    name: models.VectorParams(size=VECTOR_SIZE, distance=models.Distance.COSINE)
    for name in FEATURE_NAMES
}

collection_name = "vacancies"

# Create the collection on first import only; an existing one is left as is.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=vectors_config,
    )

embedding = OpenAIEmbeddings(model="text-embedding-3-large")


def _feature_to_text(value) -> str:
    """Render a single feature value as the text that gets embedded.

    Falsy values (``None``, ``""``, ``0``, empty containers) mean "not
    specified" and yield an empty string. Lists and tuples are joined with
    spaces; their items are passed through ``str`` so numeric entries do not
    raise ``TypeError``, and ``None`` items are skipped.
    """
    if not value:
        return ""
    if isinstance(value, (list, tuple)):
        return " ".join(str(item) for item in value if item is not None)
    return str(value)


def _prepare_texts(features):
    """Prepare texts for each feature from features dict.

    Returns a dict with exactly one entry per name in ``FEATURE_NAMES``;
    features missing from ``features`` map to an empty string.
    """
    return {name: _feature_to_text(features.get(name)) for name in FEATURE_NAMES}


def _embed_features(features: dict) -> dict[str, list[float]]:
    """Embed every feature that has text; features without text are omitted."""
    return {
        name: embedding.embed_query(text)
        for name, text in _prepare_texts(features).items()
        if text
    }


def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict):
    """Add vectors for a vacancy based on its features.

    Args:
        vacancy_id: Used as the Qdrant point id; upserting an existing id
            overwrites that point.
        features: Mapping of feature name to a value or a list of values.
            Only the names in ``FEATURE_NAMES`` are read.
        payload: Stored unchanged as the point payload.
    """
    embedded = _embed_features(features)
    # The point always carries every named vector; a feature without text
    # gets an all-zero placeholder of the configured size.
    vectors = {
        name: embedded.get(name, [0.0] * VECTOR_SIZE) for name in FEATURE_NAMES
    }
    client.upsert(
        collection_name=collection_name,
        points=[
            models.PointStruct(
                id=vacancy_id,
                vector=vectors,
                payload=payload,
            )
        ],
    )


def search_similarities(
    query_features: dict,
    query_filter: Filter,
    *,
    limit: int = SEARCH_LIMIT,
) -> Optional[Union[int, str]]:
    """Return the id of the vacancy with the highest summed similarity.

    One vector search is run for each query feature that has text. A vacancy's
    total score is the sum of the scores it received across those searches;
    a vacancy absent from a feature's hit list gets nothing for that feature.

    Args:
        query_features: Mapping of feature name to a value or a list of values.
        query_filter: Qdrant filter passed to every per-feature search.
        limit: Maximum number of hits requested per feature search.

    Returns:
        The id of the best-scoring point, or ``None`` when no search produced
        a hit (no query feature has text, or nothing passes the filter).
        On equal totals the vacancy encountered first wins.
    """
    totals: dict = {}
    for name, vec in _embed_features(query_features).items():
        # NOTE(review): newer qdrant-client releases may deprecate `search`
        # in favour of `query_points` - confirm against the pinned version.
        hits = client.search(
            collection_name=collection_name,
            query_vector=(name, vec),
            limit=limit,
            with_payload=False,  # only ids and scores are used below
            query_filter=query_filter,
        )
        for hit in hits:
            totals[hit.id] = totals.get(hit.id, 0.0) + hit.score

    if not totals:
        return None
    # max() returns the first maximal key in insertion order, i.e. the
    # vacancy that was seen first among equally scored ones.
    return max(totals, key=totals.get)