# Qdrant-backed vector store for vacancy feature embeddings:
# one named vector per feature, embedded with OpenAI text-embedding-3-large.
from qdrant_client import models
|
|
from langchain_openai import OpenAIEmbeddings
|
|
from qdrant_client import QdrantClient
|
|
from qdrant_client.models import Filter
|
|
|
|
# Local on-disk mode, kept for reference:
# client = QdrantClient(path="./embeddings")
client = QdrantClient(url="http://localhost:6333")

# Each vacancy is stored with one named vector per feature listed here.
FEATURE_NAMES = [
    "employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
    "location", "salary_range", "languages", "education", "schedule", "additional_requirements"
]

# text-embedding-3-large emits 3072-dimensional vectors; cosine distance
# matches the similarity-scoring done in search_similarities.
vectors_config = {
    feature: models.VectorParams(size=3072, distance=models.Distance.COSINE)
    for feature in FEATURE_NAMES
}

collection_name = "vacancies"

# Create the collection once; subsequent imports of this module are no-ops.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=vectors_config,
    )

embedding = OpenAIEmbeddings(model="text-embedding-3-large")
|
|
|
|
def _prepare_texts(features):
|
|
"""Prepare texts for each feature from features dict."""
|
|
texts = {}
|
|
for name in FEATURE_NAMES:
|
|
value = features.get(name)
|
|
if isinstance(value, list):
|
|
text = " ".join(value) if value else ""
|
|
else:
|
|
text = str(value) if value else ""
|
|
texts[name] = text
|
|
return texts
|
|
|
|
|
|
def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict):
    """Embed a vacancy's features and upsert them as named vectors.

    Each feature text is embedded individually; features with no text get a
    zero vector so that every named vector in the collection is populated.
    """
    vectors = {}
    for name, text in _prepare_texts(features).items():
        # Empty feature -> zero vector; non-empty -> OpenAI embedding.
        vectors[name] = embedding.embed_query(text) if text else [0.0] * 3072

    point = models.PointStruct(
        id=vacancy_id,
        vector=vectors,
        payload=payload,
    )
    client.upsert(collection_name=collection_name, points=[point])
|
|
|
|
|
|
def search_similarities(query_features: dict, query_filter: Filter) -> list[dict]:
    """Search vacancies by summing per-feature similarity scores.

    Each non-empty query feature is embedded and searched against the
    corresponding named vector. Every matched vacancy is scored by the sum
    of its per-feature similarities.

    Args:
        query_features: mapping of feature name -> query value (str/list/None).
        query_filter: Qdrant payload filter applied to every per-feature search.

    Returns:
        List of ``{"id": ..., "score": ...}`` dicts sorted by descending total
        score; empty list when no vacancy matches.
    """
    texts = _prepare_texts(query_features)
    vectors = {}
    for name, text in texts.items():
        # Empty feature -> zero vector (skipped below); non-empty -> embedding.
        vectors[name] = embedding.embed_query(text) if text else [0.0] * 3072

    # Per-vacancy map of feature name -> similarity score.
    feature_scores: dict = {}
    for name, vec in vectors.items():
        if not any(v != 0 for v in vec):
            continue  # feature absent from the query; nothing to search
        results = client.search(
            collection_name=collection_name,
            query_vector=(name, vec),
            limit=1000,
            with_payload=True,
            query_filter=query_filter,
        )
        for res in results:
            feature_scores.setdefault(res.id, {})[name] = res.score

    scored = [
        {"id": vid, "score": sum(sims.values())}
        for vid, sims in feature_scores.items()
    ]
    scored.sort(key=lambda x: x["score"], reverse=True)
    # BUG FIX: previously returned scored[0]["id"] — a bare id, contradicting
    # the declared list[dict] return type and the docstring, and raising
    # IndexError when no vacancy matched. Return the full ranked list instead.
    return scored