# 121 lines · 4.6 KiB · Python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from qdrant_client import QdrantClient, models
from qdrant_client.models import Filter

from vacancies.main.models import VacancyFeatures
|
||
client = QdrantClient(path="./embeddings")
|
||
#client = QdrantClient(url="http://localhost:6333")
|
||
|
||
FEATURE_NAMES = [
|
||
"employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
|
||
"location", "salary_range", "languages", "education", "schedule", "additional_requirements"
|
||
]
|
||
|
||
vectors_config = {
|
||
name: models.VectorParams(size=3072, distance=models.Distance.COSINE) for name in FEATURE_NAMES
|
||
}
|
||
|
||
collection_name = "vacancies"
|
||
if not client.collection_exists(collection_name):
|
||
client.create_collection(
|
||
collection_name=collection_name,
|
||
vectors_config=vectors_config
|
||
)
|
||
|
||
embedding = OpenAIEmbeddings(model="text-embedding-3-large")
|
||
|
||
def _prepare_texts(features):
|
||
"""Prepare texts for each feature from features dict."""
|
||
texts = {}
|
||
for name in FEATURE_NAMES:
|
||
value = features.get(name)
|
||
if isinstance(value, list):
|
||
text = " ".join(value) if value else ""
|
||
else:
|
||
text = str(value) if value else ""
|
||
texts[name] = text
|
||
return texts
|
||
|
||
|
||
def add_vacancy_vectors(vacancy_id: int, features: dict, payload: dict):
|
||
"""Add vectors for a vacancy based on its features."""
|
||
texts = _prepare_texts(features)
|
||
vectors = {}
|
||
for name, text in texts.items():
|
||
vectors[name] = [0.0] * 3072
|
||
if text:
|
||
vec = embedding.embed_query(text)
|
||
vectors[name] = vec
|
||
client.upsert(
|
||
collection_name=collection_name,
|
||
points=[
|
||
models.PointStruct(
|
||
id=vacancy_id,
|
||
vector=vectors,
|
||
payload=payload,
|
||
)
|
||
]
|
||
)
|
||
|
||
|
||
def search_similarities(query_features: dict, query_filter: Filter) -> list[dict]:
|
||
texts = _prepare_texts(query_features)
|
||
vectors = {}
|
||
for name, text in texts.items():
|
||
vectors[name] = [0.0] * 3072
|
||
if text:
|
||
vec = embedding.embed_query(text)
|
||
vectors[name] = vec
|
||
|
||
max_similarities = {}
|
||
for name, vec in vectors.items():
|
||
if any(v != 0 for v in vec):
|
||
results = client.search(
|
||
collection_name=collection_name,
|
||
query_vector=(name, vec),
|
||
limit=1000,
|
||
with_payload=True,
|
||
query_filter=query_filter,
|
||
)
|
||
for res in results:
|
||
vid = res.id
|
||
sim = res.score
|
||
if vid not in max_similarities:
|
||
max_similarities[vid] = {}
|
||
max_similarities[vid][name] = sim
|
||
|
||
scored = []
|
||
for vid, feature_sims in max_similarities.items():
|
||
total = sum(feature_sims.values())
|
||
scored.append({"id": vid, "score": total})
|
||
|
||
scored.sort(key=lambda x: x["score"], reverse=True)
|
||
return scored[0]["id"]
|
||
|
||
|
||
def extract_vacancy_features(content: str) -> VacancyFeatures:
|
||
prompt = f"""
|
||
Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
|
||
Features:
|
||
- employment_type: Тип занятости (e.g., Полная занятость, Частичная)
|
||
- work_format: Формат работы (e.g., Офис, Удалённо, Гибрид)
|
||
- experience: Опыт работы (e.g., 3-5 лет, Нет опыта)
|
||
- position_level: Уровень позиции (e.g., Junior, Senior)
|
||
- industry: Отрасль / Сфера деятельности (e.g., IT, Финансы)
|
||
- tech_stack: Технологический стек / Ключевые навыки (list of strings)
|
||
- location: География (e.g., Москва, Россия)
|
||
- salary_range: Зарплатные ожидания / вилка (e.g., 100000-200000 руб)
|
||
- languages: Языки (list of strings, e.g., ["Русский", "Английский"])
|
||
- education: Образование (e.g., Высшее, Среднее специальное)
|
||
- schedule: График работы (e.g., Полный день, Сменный)
|
||
- additional_requirements: Дополнительные предпочтения / требования (list of strings)
|
||
Vacancy content:
|
||
{content}
|
||
"""
|
||
openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal")
|
||
structured_llm = openai_client.with_structured_output(VacancyFeatures)
|
||
response = structured_llm.invoke(prompt)
|
||
return response
|