172 lines
6.6 KiB
Python
172 lines
6.6 KiB
Python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
||
from qdrant_client import QdrantClient, models
|
||
from qdrant_client.models import Filter, HasIdCondition
|
||
|
||
from vacancies.conf.settings import QDRANT_URL
|
||
from vacancies.main.models import RecommendedVacancy, VacancyFeatures
|
||
|
||
# Module-wide Qdrant client; every function in this module talks to the
# server at QDRANT_URL through this single instance.
qdrant_client = QdrantClient(url=QDRANT_URL)
|
||
|
||
# Names of the structured features handled by this module. Each name is used
# both as a key into a features mapping and as the name of a Qdrant named
# vector, so the order and spelling here define the vector schema.
FEATURE_NAMES = [
    "job_title", "employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
    "location", "salary_range", "languages", "education", "schedule", "additional_requirements"
]
|
||
|
||
# Per-feature multipliers applied to similarity scores when they are summed
# into one total. Features not listed here fall back to a weight of 1 at the
# places where the scores are combined (``weights.get(feature, 1)``).
weights = {
    "job_title": 70,
    "tech_stack": 10,
    "salary_range": 10,
}
|
||
|
||
# One named vector per feature, each 3072-dimensional and compared by cosine
# distance. The same configuration is used for every collection created below.
# NOTE(review): 3072 presumably matches the output size of the embedding model
# used to produce these vectors - confirm whenever the model is changed.
vectors_config = {
    name: models.VectorParams(size=3072, distance=models.Distance.COSINE) for name in FEATURE_NAMES
}
|
||
|
||
# Import-time bootstrap: make sure the "vacancies" and "cvs" collections exist.
# Both are created with the same named-vector configuration.
# NOTE(review): the nesting of create_payload_index under the first `if` is
# reconstructed; confirm that the index is meant to be created only together
# with the collection rather than on every import.
if not qdrant_client.collection_exists("vacancies"):
    qdrant_client.create_collection(
        collection_name="vacancies",
        vectors_config=vectors_config,
    )
    # Index the "timestamp" payload field with the datetime schema.
    qdrant_client.create_payload_index(
        collection_name="vacancies",
        field_name="timestamp",
        field_schema="datetime",
    )
if not qdrant_client.collection_exists("cvs"):
    qdrant_client.create_collection(
        collection_name="cvs",
        vectors_config=vectors_config,
    )
|
||
|
||
# Shared embedding client used to turn feature texts into vectors.
embedding = OpenAIEmbeddings(model="text-embedding-3-large")
|
||
|
||
def _prepare_texts(features):
    """Build one text per entry of FEATURE_NAMES from a features mapping.

    Args:
        features: Mapping (anything with ``.get``) from feature name to value.

    Returns:
        A dict keyed by every name in FEATURE_NAMES, in that order. A missing
        or falsy value becomes ``""``, a non-empty list is joined with single
        spaces, and any other truthy value is converted with ``str``.
    """

    def _as_text(value):
        # Covers None, "", 0 and empty lists alike.
        if not value:
            return ""
        if isinstance(value, list):
            return " ".join(value)
        return str(value)

    return {name: _as_text(features.get(name)) for name in FEATURE_NAMES}
|
||
|
||
|
||
def embed_features(features):
    """Embed every feature text and return the vectors keyed by feature name.

    Args:
        features: Mapping from feature name to value (string, list of strings
            or empty/None).

    Returns:
        A dict mapping each name in FEATURE_NAMES to the embedding of its text.

    NOTE(review): features that are missing or falsy are turned into ``""`` by
    ``_prepare_texts`` and are still embedded, so every item lacking a given
    feature ends up with the same vector for it. Confirm this is intended.
    """
    # _prepare_texts already maps missing and falsy values to "", so no
    # separate pre-filtering of the mapping is required.
    texts_by_feature = _prepare_texts(features)
    names = list(texts_by_feature)
    # embed_documents is declared to take a list of strings; hand it a real
    # list instead of a dict view, which cannot be indexed or sliced.
    texts = [texts_by_feature[name] for name in names]
    return dict(zip(names, embedding.embed_documents(texts)))
|
||
|
||
|
||
def add_vectors(
    collection_name: str,
    _id: int,
    features: dict,
    payload: dict,
    vectors,
    *,
    duplicate_threshold: float = 80,
):
    """Store a point unless a near-duplicate already exists in the collection.

    For every named vector the 100 nearest points of ``collection_name`` are
    fetched. Each hit contributes ``score * weights.get(name, 1)`` to that
    point's total. If the highest total exceeds ``duplicate_threshold`` the
    new point is considered a duplicate and nothing is written.

    Args:
        collection_name: Collection that is both checked for duplicates and
            written to.
        _id: Id of the point to upsert.
        features: Not used by this function; kept for caller compatibility.
        payload: Payload stored with the point.
        vectors: Mapping from vector name to embedding.
        duplicate_threshold: Weighted similarity above which the point is
            skipped. Defaults to 80.

    Returns:
        None in every case.

    NOTE(review): the duplicate check used to query "vacancies" regardless of
    ``collection_name``. It now runs against the target collection, so
    near-identical CVs are also skipped - confirm that is wanted for "cvs".
    """
    # A point must not be treated as a duplicate of itself, otherwise
    # re-adding an existing id could never refresh its vectors or payload.
    not_self = Filter(must_not=[HasIdCondition(has_id=[_id])])

    weighted_totals = {}
    for name, vec in vectors.items():
        results = qdrant_client.query_points(
            collection_name=collection_name,
            query=vec,
            using=name,
            limit=100,
            query_filter=not_self,
        )
        feature_weight = weights.get(name, 1)
        for res in results.points:
            weighted_totals[res.id] = weighted_totals.get(res.id, 0) + res.score * feature_weight

    # Only the best candidate matters for the decision, so no sort is needed.
    if weighted_totals and max(weighted_totals.values()) > duplicate_threshold:
        return

    qdrant_client.upsert(
        collection_name=collection_name,
        points=[models.PointStruct(id=_id, vector=vectors, payload=payload)]
    )
|
||
|
||
|
||
def search_similarities(query_filter: Filter, cv_id: int):
    """Find the vacancy with the highest weighted similarity to a stored CV.

    The CV's named vectors are loaded from the "cvs" collection. For each of
    them the "vacancies" collection is queried with ``query_filter``, and every
    hit adds ``score * weights.get(name, 1)`` to that vacancy's total.

    Args:
        query_filter: Qdrant filter applied to every vacancy query.
        cv_id: Id of the CV point in the "cvs" collection.

    Returns:
        ``(vacancy_id, content, link)`` for the best-scoring vacancy, where
        ``content`` and ``link`` are read from its payload, or ``None`` when
        no vacancy matched.

    Raises:
        IndexError: If no point with ``cv_id`` exists in "cvs".
        KeyError: If the winning payload lacks "content" or "link".
    """
    cv = qdrant_client.retrieve(collection_name="cvs", ids=[cv_id], with_vectors=True)[0]

    weighted_totals, payloads = {}, {}
    for name, vec in cv.vector.items():
        results = qdrant_client.query_points(
            collection_name="vacancies",
            query=vec,
            using=name,
            limit=100000,
            with_payload=True,
            query_filter=query_filter,
        )
        feature_weight = weights.get(name, 1)
        for res in results.points:
            weighted_totals[res.id] = weighted_totals.get(res.id, 0) + res.score * feature_weight
            payloads[res.id] = res.payload

    # Nothing matched (for example every vacancy is filtered out): report that
    # with None instead of failing on an empty ranking.
    if not weighted_totals:
        return None

    # max() keeps the first of equally scored candidates, which is the same
    # winner a stable descending sort would put first.
    best_id = max(weighted_totals, key=weighted_totals.get)
    best_payload = payloads[best_id]
    return best_id, best_payload["content"], best_payload["link"]
|
||
|
||
|
||
def batch_extract_features(contents: list[str]) -> list[VacancyFeatures]:
    """Extract structured features from several vacancy texts in one batch.

    One prompt is built per entry of ``contents`` and the prompts are sent
    together through a chat model configured for structured output with the
    ``VacancyFeatures`` schema.

    Args:
        contents: Raw vacancy descriptions.

    Returns:
        The result of the structured model's ``batch`` call on the prompts.
    """

    def _prompt_for(content):
        # NOTE(review): the prompt lines are written flush-left so that no
        # indentation is sent to the model; confirm against the deployed text.
        return f"""
Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
Features:
- job_title: Должность (e.g., DevOps, Python программист)
- employment_type: Тип занятости (e.g., Полная занятость, Частичная)
- work_format: Формат работы (e.g., Офис, Удалённо, Гибрид)
- experience: Опыт работы (e.g., 3-5 лет, Нет опыта)
- position_level: Уровень позиции (e.g., Junior, Senior)
- industry: Отрасль / Сфера деятельности (e.g., IT, Финансы)
- tech_stack: Технологический стек / Ключевые навыки (list of strings)
- location: География (e.g., Москва, Россия)
- salary_range: Зарплатные ожидания / вилка (e.g., 100000-200000 руб)
- languages: Языки (list of strings, e.g., ["Русский", "Английский"])
- education: Образование (e.g., Высшее, Среднее специальное)
- schedule: График работы (e.g., Полный день, Сменный)
- additional_requirements: Дополнительные предпочтения / требования (list of strings)
Vacancy content:
{content}
"""

    request_texts = [_prompt_for(content) for content in contents]

    chat_model = ChatOpenAI(model_name="gpt-5-mini", reasoning_effort="minimal", temperature=0, seed=42, top_p=1)
    extractor = chat_model.with_structured_output(VacancyFeatures)
    return extractor.batch(request_texts)
|
||
|
||
|
||
def get_next_vacancy(customer_cv):
    """Pick the next not-yet-recommended vacancy for a customer's CV.

    Vacancies already recorded in ``RecommendedVacancy`` for the customer are
    excluded from the search. When a vacancy is found, a new
    ``RecommendedVacancy`` row is created for it.

    Args:
        customer_cv: Object exposing ``customer`` and ``id``; ``id`` is used as
            the CV point id for the similarity search.

    Returns:
        ``(recommendation, vacancy_content, link)``, or ``None`` when
        ``search_similarities`` returns a falsy result.
    """
    # Materialise the lazy queryset: the Qdrant condition is a validated model
    # that expects a concrete list of ids, not an arbitrary iterable.
    recommended_vacancy_ids = list(
        RecommendedVacancy.objects.filter(
            customer=customer_cv.customer,
        ).values_list('vacancy_id', flat=True)
    )

    query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)])
    result = search_similarities(query_filter, customer_cv.id)
    if not result:
        return None

    search_result_id, vacancy_content, link = result

    # Record the recommendation so the same vacancy is not offered again.
    recommendation = RecommendedVacancy.objects.create(
        customer=customer_cv.customer,
        vacancy_id=search_result_id,
    )

    return recommendation, vacancy_content, link
|