vision-career/vacancies/main/vector_store.py

121 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from qdrant_client import models
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from qdrant_client import QdrantClient
from qdrant_client.models import Filter
from vacancies.main.models import VacancyFeatures
client = QdrantClient(path="./embeddings")
#client = QdrantClient(url="http://localhost:6333")
FEATURE_NAMES = [
"employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
"location", "salary_range", "languages", "education", "schedule", "additional_requirements"
]
vectors_config = {
name: models.VectorParams(size=3072, distance=models.Distance.COSINE) for name in FEATURE_NAMES
}
collection_name = "vacancies"
if not client.collection_exists(collection_name):
client.create_collection(
collection_name=collection_name,
vectors_config=vectors_config
)
embedding = OpenAIEmbeddings(model="text-embedding-3-large")
def _prepare_texts(features):
"""Prepare texts for each feature from features dict."""
texts = {}
for name in FEATURE_NAMES:
value = features.get(name)
if isinstance(value, list):
text = " ".join(value) if value else ""
else:
text = str(value) if value else ""
texts[name] = text
return texts
def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict):
"""Add vectors for a vacancy based on its features."""
texts = _prepare_texts(features)
vectors = {}
for name, text in texts.items():
vectors[name] = [0.0] * 3072
if text:
vec = embedding.embed_query(text)
vectors[name] = vec
client.upsert(
collection_name=collection_name,
points=[
models.PointStruct(
id=vacancy_id,
vector=vectors,
payload=payload,
)
]
)
def search_similarities(query_features: dict, query_filter: Filter) -> list[dict]:
texts = _prepare_texts(query_features)
vectors = {}
for name, text in texts.items():
vectors[name] = [0.0] * 3072
if text:
vec = embedding.embed_query(text)
vectors[name] = vec
max_similarities = {}
for name, vec in vectors.items():
if any(v != 0 for v in vec):
results = client.search(
collection_name=collection_name,
query_vector=(name, vec),
limit=1000,
with_payload=True,
query_filter=query_filter,
)
for res in results:
vid = res.id
sim = res.score
if vid not in max_similarities:
max_similarities[vid] = {}
max_similarities[vid][name] = sim
scored = []
for vid, feature_sims in max_similarities.items():
total = sum(feature_sims.values())
scored.append({"id": vid, "score": total})
scored.sort(key=lambda x: x["score"], reverse=True)
return scored[0]["id"]
def extract_vacancy_features(content: str) -> VacancyFeatures:
prompt = f"""
Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
Features:
- employment_type: Тип занятости (e.g., Полная занятость, Частичная)
- work_format: Формат работы (e.g., Офис, Удалённо, Гибрид)
- experience: Опыт работы (e.g., 3-5 лет, Нет опыта)
- position_level: Уровень позиции (e.g., Junior, Senior)
- industry: Отрасль / Сфера деятельности (e.g., IT, Финансы)
- tech_stack: Технологический стек / Ключевые навыки (list of strings)
- location: География (e.g., Москва, Россия)
- salary_range: Зарплатные ожидания / вилка (e.g., 100000-200000 руб)
- languages: Языки (list of strings, e.g., ["Русский", "Английский"])
- education: Образование (e.g., Высшее, Среднее специальное)
- schedule: График работы (e.g., Полный день, Сменный)
- additional_requirements: Дополнительные предпочтения / требования (list of strings)
Vacancy content:
{content}
"""
openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal")
structured_llm = openai_client.with_structured_output(VacancyFeatures)
response = structured_llm.invoke(prompt)
return response