import pprint

from langchain_community.embeddings import DeepInfraEmbeddings
from langchain_openai import ChatOpenAI
from qdrant_client import QdrantClient, models
from qdrant_client.models import Filter, HasIdCondition

from vacancies.conf.settings import QDRANT_URL
from vacancies.main.models import RecommendedVacancy, VacancyFeatures

qdrant_client = QdrantClient(url=QDRANT_URL)

# Features extracted from every vacancy and CV; each one is stored as its own named vector.
FEATURE_NAMES = [
    "job_title", "employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
    "location", "salary_range", "languages", "education", "schedule", "additional_requirements",
]
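
# Per-feature weights for the aggregated similarity score. Cosine similarity per feature is
# at most 1.0 and the weights below sum to 100, so a perfect match on every feature scores
# 100; job_title dominates the ranking (42 of 100), and the threshold of 80 used in
# add_vectors() corresponds to roughly 80% of the maximum weighted similarity.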
weights = {
    "job_title": 42,
    "employment_type": 5,
    "work_format": 5,
    "experience": 8,
    "position_level": 5,
    "industry": 1,
    "tech_stack": 10,
    "location": 4,
    "salary_range": 10,
    "languages": 4,
    "education": 2,
    "schedule": 2,
    "additional_requirements": 2,
}

# One named vector per feature; 4096 matches the output dimension of the embedding model below.
vectors_config = {
    name: models.VectorParams(size=4096, distance=models.Distance.COSINE) for name in FEATURE_NAMES
}
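
# Collection bootstrap: "vacancies" and "cvs" share the same named-vector schema, so a CV's
# per-feature vectors can be queried directly against the matching vacancy vectors. The
# "timestamp" payload index on "vacancies" enables datetime filters on queries (the exact
# filter usage is assumed to live in calling code that is not part of this module).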
if not qdrant_client.collection_exists("vacancies"):
    qdrant_client.create_collection(
        collection_name="vacancies",
        vectors_config=vectors_config,
    )
    qdrant_client.create_payload_index(
        collection_name="vacancies",
        field_name="timestamp",
        field_schema="datetime",
    )
if not qdrant_client.collection_exists("cvs"):
    qdrant_client.create_collection(
        collection_name="cvs",
        vectors_config=vectors_config,
    )

embedding = DeepInfraEmbeddings(
    model_id="Qwen/Qwen3-Embedding-8B-batch",
)

def _prepare_texts(features):
    """Build one text string per feature: lists are joined with spaces, missing values become ""."""
    texts = {}
    for name in FEATURE_NAMES:
        value = features.get(name)
        if isinstance(value, list):
            text = " ".join(value) if value else ""
        else:
            text = str(value) if value else ""
        texts[name] = text
    return texts
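
# _prepare_texts(), for illustration (hypothetical input):
#   {"job_title": "DevOps", "tech_stack": ["Python", "Docker"], "industry": None}
# becomes
#   {"job_title": "DevOps", "tech_stack": "Python Docker", "industry": "", ...}
# with every other feature name present as an empty string.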

def add_vectors(collection_name: str, _id: int, features: dict, payload: dict):
    """Embed the extracted features and upsert them as named vectors into `collection_name`.

    Before inserting, the "vacancies" collection is checked for a near-duplicate: if the best
    existing point exceeds the weighted-score threshold, the new point is skipped.
    """
    texts = _prepare_texts(features)
    vectors = {}
    for name, text in texts.items():
        # Missing features are stored as zero vectors so every point carries the full set of named vectors.
        vectors[name] = [0.0] * 4096
        if text:
            vectors[name] = embedding.embed_query(text)

    # Collect, per existing vacancy, the similarity of each non-empty feature vector.
    max_similarities = {}
    for name, vec in vectors.items():
        if any(v != 0 for v in vec):
            results = qdrant_client.query_points(
                collection_name="vacancies",
                query=vec,
                using=name,
                limit=100,
            )
            for res in results.points:
                vid = res.id
                sim = res.score
                if vid not in max_similarities:
                    max_similarities[vid] = {}
                max_similarities[vid][name] = sim

    # Weighted sum of per-feature similarities for each candidate duplicate.
    scored = []
    for vid, feature_sims in max_similarities.items():
        total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims)
        scored.append({"id": vid, "score": total})

    scored.sort(key=lambda x: x["score"], reverse=True)
    if scored and scored[0]["score"] > 80:  # near-duplicate threshold (maximum possible score is 100)
        return

    qdrant_client.upsert(
        collection_name=collection_name,
        points=[
            models.PointStruct(
                id=_id,
                vector=vectors,
                payload=payload,
            )
        ],
    )
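
# Usage sketch (hypothetical caller; the names raw_text, vacancy_id, url and scraped_at and the
# exact payload assembly are illustrative, the real ingestion code is not part of this module):
#
#   features = extract_features(raw_text)              # -> VacancyFeatures
#   add_vectors(
#       collection_name="vacancies",
#       _id=vacancy_id,
#       features=features.dict(),                      # .model_dump() on pydantic v2
#       payload={
#           "content": raw_text,
#           "features_json": features.dict(),
#           "link": url,
#           "timestamp": scraped_at.isoformat(),       # indexed as datetime above
#       },
#   )
#
# The payload keys "content", "features_json" and "link" are the ones search_similarities()
# reads back when building recommendations.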

def search_similarities(query_filter: Filter, cv_id: int):
    """Rank vacancies against the stored CV vectors and return the best match (id, content, link)."""
    cv = qdrant_client.retrieve(
        collection_name="cvs",
        ids=[cv_id],
        with_vectors=True,
    )[0]

    max_similarities = {}
    vacancies_content = {}
    for name, vec in cv.vector.items():
        results = qdrant_client.query_points(
            collection_name="vacancies",
            query=vec,
            using=name,
            limit=100000,  # effectively "all matching vacancies"
            with_payload=True,
            query_filter=query_filter,
        )
        for res in results.points:
            vid = res.id
            sim = res.score
            if vid not in max_similarities:
                max_similarities[vid] = {}
            max_similarities[vid][name] = sim
            if vid not in vacancies_content:
                vacancies_content[vid] = {}
                vacancies_content[vid]["content"] = res.payload["content"]
                vacancies_content[vid]["features_json"] = res.payload["features_json"]
                vacancies_content[vid]["link"] = res.payload["link"]

    # Aggregate per-feature similarities into one weighted score per vacancy.
    scored = []
    for vid, feature_sims in max_similarities.items():
        total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims)
        scored.append({
            "id": vid,
            "score": total,
            "content": vacancies_content[vid]["content"],
            "features_json": vacancies_content[vid]["features_json"],
            "link": vacancies_content[vid]["link"],
            "sims": feature_sims,
        })

    if not scored:
        return None

    scored.sort(key=lambda x: x["score"], reverse=True)
    # Debug output: top-5 candidates with their per-feature similarities.
    pprint.pprint(scored[:5])

    return scored[0]["id"], scored[0]["content"], scored[0]["link"]
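
# Worked example of the weighted score: a vacancy whose job_title vector matches the CV at
# 0.9 cosine similarity and whose tech_stack matches at 0.7 (no other features returned)
# scores 0.9 * 42 + 0.7 * 10 = 44.8; with all 13 features at similarity 1.0 the maximum is 100.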

def extract_features(content: str) -> VacancyFeatures:
    """Extract structured features from a raw vacancy text using an LLM with structured output."""
    prompt = f"""
    Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
    Features:
    - job_title: Должность (e.g., DevOps, Python программист)
    - employment_type: Тип занятости (e.g., Полная занятость, Частичная)
    - work_format: Формат работы (e.g., Офис, Удалённо, Гибрид)
    - experience: Опыт работы (e.g., 3-5 лет, Нет опыта)
    - position_level: Уровень позиции (e.g., Junior, Senior)
    - industry: Отрасль / Сфера деятельности (e.g., IT, Финансы)
    - tech_stack: Технологический стек / Ключевые навыки (list of strings)
    - location: География (e.g., Москва, Россия)
    - salary_range: Зарплатные ожидания / вилка (e.g., 100000-200000 руб)
    - languages: Языки (list of strings, e.g., ["Русский", "Английский"])
    - education: Образование (e.g., Высшее, Среднее специальное)
    - schedule: График работы (e.g., Полный день, Сменный)
    - additional_requirements: Дополнительные предпочтения / требования (list of strings)
    Vacancy content:
    {content}
    """
    openai_client = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal", temperature=0, seed=42, top_p=1)
    structured_llm = openai_client.with_structured_output(VacancyFeatures)
    return structured_llm.invoke(prompt)

def get_next_vacancy(customer_cv):
    """Return the best not-yet-recommended vacancy for the customer and record the recommendation."""
    recommended_vacancy_ids = RecommendedVacancy.objects.filter(
        customer=customer_cv.customer,
    ).values_list('vacancy_id', flat=True)

    # Exclude vacancies that were already recommended to this customer.
    query_filter = Filter(must_not=[HasIdCondition(has_id=list(recommended_vacancy_ids))])
    result = search_similarities(query_filter, customer_cv.id)
    if not result:
        return None

    search_result_id, vacancy_content, link = result

    recommendation = RecommendedVacancy.objects.create(
        customer=customer_cv.customer,
        vacancy_id=search_result_id,
    )

    return recommendation, vacancy_content, link
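
# End-to-end sketch (hypothetical caller, e.g. a view or background task outside this module),
# assuming the customer's CV has already been indexed into the "cvs" collection (e.g. via
# add_vectors()): each call to get_next_vacancy(customer_cv) returns a fresh
# (RecommendedVacancy, content, link) tuple, or None once no unrecommended vacancy matches
# the filter.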