# vision-career/vacancies/main/vector_store.py
#
# Listing metadata: 98 lines, 3.1 KiB, Python

from qdrant_client import models
from langchain_openai import OpenAIEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.models import Filter
# Qdrant connection. This targets a server at localhost:6333; for an embedded,
# file-backed store use QdrantClient(path="./embeddings") instead.
client = QdrantClient(url="http://localhost:6333")

# Vacancy attributes that are embedded separately. Each name doubles as the
# name of a Qdrant named vector in the collection.
FEATURE_NAMES = [
    "employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
    "location", "salary_range", "languages", "education", "schedule", "additional_requirements",
]

# Embedding model and the vector size every named vector is created with.
# The two must agree: the collection rejects vectors of any other length.
EMBEDDING_MODEL = "text-embedding-3-large"
EMBEDDING_DIM = 3072

# One cosine-distance vector space per feature.
vectors_config = {
    name: models.VectorParams(size=EMBEDDING_DIM, distance=models.Distance.COSINE)
    for name in FEATURE_NAMES
}

collection_name = "vacancies"

# Import-time bootstrap: create the collection only if it does not exist yet,
# so an already-populated collection is left untouched.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=vectors_config,
    )

embedding = OpenAIEmbeddings(model=EMBEDDING_MODEL)
def _prepare_texts(features):
    """Build the text to embed for every feature in ``FEATURE_NAMES``.

    Args:
        features: Mapping of feature name to its raw value. Keys that are
            absent are handled like ``None``.

    Returns:
        A dict with exactly one entry per name in ``FEATURE_NAMES``. List
        values are joined with single spaces, numbers are rendered with
        ``str`` (including zero), any other truthy value is rendered with
        ``str``, and missing/empty values become ``""``.
    """
    texts = {}
    for name in FEATURE_NAMES:
        value = features.get(name)
        if isinstance(value, list):
            # str() every item so a list holding non-string items (numbers,
            # for instance) cannot make str.join raise TypeError; None items
            # carry no text and are skipped.
            text = " ".join(str(item) for item in value if item is not None)
        elif isinstance(value, (int, float)) and not isinstance(value, bool):
            # A numeric zero is a real value; a plain truthiness test would
            # silently turn it into an empty feature.
            text = str(value)
        else:
            text = str(value) if value else ""
        texts[name] = text
    return texts
def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict):
    """Embed a vacancy's features and upsert them as a single Qdrant point.

    Args:
        vacancy_id: Used as the point id, so calling this again with the same
            id goes through ``upsert`` for that same point.
        features: Mapping of feature name to raw value; converted to text by
            ``_prepare_texts``.
        payload: Stored on the point as-is.

    Returns:
        None.
    """
    texts = _prepare_texts(features)

    vectors = {}
    for name, text in texts.items():
        if text:
            vectors[name] = embedding.embed_query(text)
        else:
            # Features without text get an all-zero placeholder. Its length is
            # read from the collection config so it always matches the size
            # the named vector was declared with.
            vectors[name] = [0.0] * vectors_config[name].size

    client.upsert(
        collection_name=collection_name,
        points=[
            models.PointStruct(
                id=vacancy_id,
                vector=vectors,
                payload=payload,
            )
        ],
    )
def search_similarities(query_features: dict, query_filter: Filter) -> "int | str | None":
    """Return the id of the vacancy with the highest summed feature similarity.

    Every query feature that has text is embedded and searched against the
    named vector of the same name. A vacancy's score is the sum of the
    similarity scores it received across those per-feature searches; a
    feature search that did not return the vacancy contributes nothing.

    Args:
        query_features: Mapping of feature name to raw value; converted to
            text by ``_prepare_texts``.
        query_filter: Qdrant filter applied to every per-feature search.

    Returns:
        The point id of the best-scoring vacancy, or ``None`` when no search
        was run or none returned a hit. On equal totals the vacancy that was
        encountered first wins.
    """
    texts = _prepare_texts(query_features)

    totals = {}
    for name, text in texts.items():
        if not text:
            # Nothing to embed for this feature, so it cannot contribute.
            continue

        vec = embedding.embed_query(text)
        if not any(vec):
            # An all-zero query vector is skipped rather than searched.
            continue

        results = client.search(
            collection_name=collection_name,
            query_vector=(name, vec),
            # Only the 1000 closest points per feature are scored.
            limit=1000,
            # Only ids and scores are read below, so payloads are not fetched.
            with_payload=False,
            query_filter=query_filter,
        )
        for res in results:
            totals[res.id] = totals.get(res.id, 0.0) + res.score

    if not totals:
        return None

    # max() keeps the first key among equal totals, i.e. first-seen order.
    return max(totals, key=totals.get)