vision-career/vacancies/main/vector_store.py

144 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from qdrant_client import models
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from qdrant_client import QdrantClient
from qdrant_client.models import Filter
from vacancies.main.models import VacancyFeatures
# Shared Qdrant client for this module, pointed at the server URL below.
# The commented-out line is an alternative, path-based construction kept for reference.
# client = QdrantClient(path="./embeddings")
client = QdrantClient(url="http://localhost:6333")
# Names of the per-feature vectors kept for every point. The order of this
# list is the order in which features are processed elsewhere in the module.
FEATURE_NAMES = [
    "job_title",
    "employment_type",
    "work_format",
    "experience",
    "position_level",
    "industry",
    "tech_stack",
    "location",
    "salary_range",
    "languages",
    "education",
    "schedule",
    "additional_requirements",
]

# Multiplier applied to each feature's similarity score when matches are
# ranked; the scoring code falls back to 1 for a feature that has no entry.
weights = dict(
    job_title=10,
    employment_type=2,
    work_format=2,
    experience=3,
    position_level=5,
    industry=4,
    tech_stack=5,
    location=2,
    salary_range=2,
    languages=2,
    education=1,
    schedule=1,
    additional_requirements=1,
)
# One named vector per feature, all with the same size and distance metric.
# The size (3072) equals the length of the zero vector that add_vectors
# stores for a feature that has no text.
vectors_config = {
    feature: models.VectorParams(size=3072, distance=models.Distance.COSINE)
    for feature in FEATURE_NAMES
}


def _ensure_collection(collection_name):
    """Create ``collection_name`` with ``vectors_config`` unless it already exists."""
    if client.collection_exists(collection_name):
        return
    client.create_collection(
        collection_name=collection_name,
        vectors_config=vectors_config,
    )


# Import-time side effect: make sure both collections are present.
_ensure_collection("vacancies")
_ensure_collection("cvs")

# Embedding model used to turn feature texts into vectors.
embedding = OpenAIEmbeddings(model="text-embedding-3-large")
def _prepare_texts(features):
    """Build the text to embed for every feature in ``FEATURE_NAMES``.

    Args:
        features: Mapping of feature name to its value; only ``.get`` is
            used. Values may be lists, scalars, or missing.

    Returns:
        A dict with exactly one entry per name in ``FEATURE_NAMES``. List
        values are joined with single spaces, other truthy values are
        converted with ``str``, and missing or falsy values become ``""``.
    """
    texts = {}
    for name in FEATURE_NAMES:
        value = features.get(name)
        if not value:
            # Missing key, None, empty string/list, 0: nothing to embed.
            texts[name] = ""
        elif isinstance(value, list):
            # Stringify each item so a stray non-string entry (e.g. a number)
            # cannot make join() raise TypeError; None entries are dropped.
            texts[name] = " ".join(
                str(item) for item in value if item is not None
            )
        else:
            texts[name] = str(value)
    return texts
def add_vectors(collection_name: str, _id: int, features: dict, payload: dict):
    """Embed a record's features and upsert them as one multi-vector point.

    Works for any collection that uses the per-feature named vectors (both
    the vacancy and the CV collections are created that way in this module).

    Args:
        collection_name: Name of the Qdrant collection to write to.
        _id: Id of the point to upsert.
        features: Mapping of feature name to value, as accepted by
            ``_prepare_texts``.
        payload: Payload stored alongside the vectors.
    """
    texts = _prepare_texts(features)

    # Every feature starts with its own all-zero vector; features that have
    # no text keep it. The length must match the collection's vector size.
    vectors = {name: [0.0] * 3072 for name in texts}

    # Embed all non-empty texts in one batched request instead of issuing a
    # separate embedding call per feature.
    names_with_text = [name for name, text in texts.items() if text]
    if names_with_text:
        embedded = embedding.embed_documents(
            [texts[name] for name in names_with_text]
        )
        vectors.update(zip(names_with_text, embedded))

    client.upsert(
        collection_name=collection_name,
        points=[
            models.PointStruct(
                id=_id,
                vector=vectors,
                payload=payload,
            )
        ],
    )
def search_similarities(query_filter: Filter, cv_id: int) -> tuple:
    """Return the best-matching vacancy for a stored CV.

    For every named vector of the CV that is not all zeros, the
    ``vacancies`` collection is queried with that vector against the
    same-named vacancy vector (up to 1000 hits, restricted by
    ``query_filter``). Each vacancy's per-feature scores are then combined
    into a weighted sum using ``weights`` (default weight 1), and the
    vacancy with the highest total wins.

    Args:
        query_filter: Qdrant filter applied to every vacancy query.
        cv_id: Id of the CV point in the ``cvs`` collection.

    Returns:
        A ``(vacancy_id, content)`` tuple, where ``content`` is the
        ``"content"`` entry of the vacancy's payload.

    Raises:
        IndexError: If the CV is not found, or if no vacancy was returned
            by any query (including the case where all CV vectors are zero).
        KeyError: If a returned vacancy's payload has no ``"content"`` key.
    """
    cv_points = client.retrieve(
        collection_name="cvs",
        ids=[cv_id],
        with_vectors=True,
    )
    if not cv_points:
        raise IndexError(f"CV {cv_id} was not found in the 'cvs' collection")
    cv_vectors = cv_points[0].vector

    feature_scores = {}  # vacancy id -> {feature name: similarity score}
    vacancies_content = {}  # vacancy id -> payload["content"]
    for name, vec in cv_vectors.items():
        # An all-zero vector means the CV has no text for this feature.
        if not any(v != 0 for v in vec):
            continue
        results = client.query_points(
            collection_name="vacancies",
            query=vec,
            using=name,
            limit=1000,
            with_payload=True,
            query_filter=query_filter,
        )
        for res in results.points:
            feature_scores.setdefault(res.id, {})[name] = res.score
            if res.id not in vacancies_content:
                vacancies_content[res.id] = res.payload["content"]

    if not feature_scores:
        raise IndexError(f"No vacancy matched CV {cv_id}")

    totals = {
        vid: sum(sim * weights.get(feature, 1) for feature, sim in sims.items())
        for vid, sims in feature_scores.items()
    }
    # max() keeps the first of several equal totals, which is the same
    # element a stable descending sort would place first.
    best_id = max(totals, key=totals.get)
    return best_id, vacancies_content[best_id]
def extract_features(content: str) -> VacancyFeatures:
    """Ask the chat model to extract structured features from ``content``.

    The text is embedded into a fixed instruction prompt and sent to a
    ``ChatOpenAI`` model configured for structured output with the
    ``VacancyFeatures`` schema.

    Args:
        content: Raw text to analyse.

    Returns:
        Whatever the structured-output model returns for the prompt.
    """
    instructions = f"""
Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
Features:
- job_title: Должность (e.g., DevOps, Python программист)
- employment_type: Тип занятости (e.g., Полная занятость, Частичная)
- work_format: Формат работы (e.g., Офис, Удалённо, Гибрид)
- experience: Опыт работы (e.g., 3-5 лет, Нет опыта)
- position_level: Уровень позиции (e.g., Junior, Senior)
- industry: Отрасль / Сфера деятельности (e.g., IT, Финансы)
- tech_stack: Технологический стек / Ключевые навыки (list of strings)
- location: География (e.g., Москва, Россия)
- salary_range: Зарплатные ожидания / вилка (e.g., 100000-200000 руб)
- languages: Языки (list of strings, e.g., ["Русский", "Английский"])
- education: Образование (e.g., Высшее, Среднее специальное)
- schedule: График работы (e.g., Полный день, Сменный)
- additional_requirements: Дополнительные предпочтения / требования (list of strings)
Vacancy content:
{content}
"""
    # A new model client is built on every call, then bound to the schema.
    extractor = ChatOpenAI(
        model_name="gpt-5-mini",
        reasoning_effort="minimal",
    ).with_structured_output(VacancyFeatures)
    return extractor.invoke(instructions)