Fix vacancies recommendations weights

This commit is contained in:
estromenko 2025-10-30 00:30:37 +03:00
parent cf9f19a216
commit 9dceaaeccc
4 changed files with 36 additions and 41 deletions

View File

@ -5,23 +5,7 @@ from vacancies.main.vector_store import add_vectors, extract_features, client as
clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123) clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123)
next_page_offset = 0 query = """
exist_points_ids = []
while next_page_offset is not None:
response = qdrant.scroll(
collection_name="vacancies",
limit=100_000,
offset=next_page_offset,
with_payload=False,
with_vectors=False,
timeout=30,
)
exist_points_ids.extend([point.id for point in response[0]])
next_page_offset = response[1]
exist_points_set = tuple(exist_points_ids)
print("qdrant vacancies points count:", len(exist_points_set))
query = f"""
SELECT id, chat_username, telegram_id, message, timestamp SELECT id, chat_username, telegram_id, message, timestamp
FROM telegram_parser_chatmessage FROM telegram_parser_chatmessage
WHERE timestamp >= now() - INTERVAL 30 DAY WHERE timestamp >= now() - INTERVAL 30 DAY
@ -33,7 +17,7 @@ WHERE timestamp >= now() - INTERVAL 30 DAY
'удаленно', 'гибкий график', 'полный день', 'частичная занятость', 'удаленно', 'гибкий график', 'полный день', 'частичная занятость',
'резюме', 'собеседование', 'junior', 'middle', 'senior' 'резюме', 'собеседование', 'junior', 'middle', 'senior'
]) >= 5 AND position(message, 'О себе') = 0 AND position(message, 'Обо мне') = 0 AND position(message, '#ищу') = 0 ]) >= 5 AND position(message, 'О себе') = 0 AND position(message, 'Обо мне') = 0 AND position(message, '#ищу') = 0
AND id NOT IN {exist_points_set} AND id NOT IN %(exist_points)s
""" """
@ -41,28 +25,36 @@ class Command(BaseCommand):
help = "Collect vacancies from telegram messages" help = "Collect vacancies from telegram messages"
def handle(self, *args, **options): def handle(self, *args, **options):
import time next_page_offset = 0
start_time = time.time() exist_points_ids = [-1]
result_rows = clickhouse_client.query(query).result_rows while next_page_offset is not None:
print(f"query time: {time.time() - start_time:.4f}") response = qdrant.scroll(
collection_name="vacancies",
limit=100_000,
offset=next_page_offset,
with_payload=False,
with_vectors=False,
timeout=30,
)
exist_points_ids.extend([point.id for point in response[0]])
next_page_offset = response[1]
exist_points_set = tuple(set(exist_points_ids))
result_rows = clickhouse_client.query(query, parameters={"exist_points": exist_points_set}).result_rows
result_rows_len = len(result_rows) result_rows_len = len(result_rows)
for index, row in enumerate(result_rows): for index, row in enumerate(result_rows):
(id, chat_username, telegram_id, message, timestamp) = row (id, chat_username, telegram_id, message, timestamp) = row
link = f"https://t.me/{chat_username}/{telegram_id}" link = f"https://t.me/{chat_username}/{telegram_id}"
print(f"Processing {index}/{result_rows_len} link: {link}") print(f"Processing {index}/{result_rows_len} link: {link}")
start_time = time.time()
features = extract_features(message) features = extract_features(message)
print(f"ai time: {time.time() - start_time:.4f}")
vacancy, created = Vacancy.objects.get_or_create( vacancy, created = Vacancy.objects.get_or_create(
link=link, link=link,
) )
start_time = time.time()
add_vectors( add_vectors(
"vacancies", "vacancies",
vacancy.id, vacancy.id,
features.model_dump(), features.model_dump(),
{'content': message, 'features_json': features.model_dump()}, {'content': message, 'features_json': features.model_dump()},
) )
print(f"write vector time: {time.time() - start_time:.4f}")

View File

@ -1,6 +1,7 @@
import asyncio import asyncio
from django.core.management import BaseCommand from django.core.management import BaseCommand
from vacancies.main.vector_store import search_similarities, client from vacancies.main.vector_store import search_similarities
from vacancies.main.models import CustomerCV, RecommendedVacancy from vacancies.main.models import CustomerCV, RecommendedVacancy
from vacancies.main.bot import application from vacancies.main.bot import application
from telegram import InlineKeyboardButton, InlineKeyboardMarkup from telegram import InlineKeyboardButton, InlineKeyboardMarkup
@ -19,12 +20,7 @@ class Command(BaseCommand):
).values_list('vacancy_id', flat=True) ).values_list('vacancy_id', flat=True)
query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)]) query_filter = Filter(must_not=[HasIdCondition(has_id=recommended_vacancy_ids)])
search_result_id = search_similarities(query_filter, customer_cv.id) search_result_id, vacancy_content = search_similarities(query_filter, customer_cv.id)
vacancy_content = client.retrieve(
collection_name="vacancies",
ids=[search_result_id],
)[0].payload["content"]
recommendation = RecommendedVacancy.objects.create( recommendation = RecommendedVacancy.objects.create(
customer=customer_cv.customer, customer=customer_cv.customer,

View File

@ -62,6 +62,7 @@ class RecommendedVacancy(models.Model):
class VacancyFeatures(BaseModel): class VacancyFeatures(BaseModel):
job_title: str | None = None # Должность
employment_type: str | None = None # Тип занятости employment_type: str | None = None # Тип занятости
work_format: str | None = None # Формат работы work_format: str | None = None # Формат работы
experience: str | None = None # Опыт работы experience: str | None = None # Опыт работы

View File

@ -9,15 +9,16 @@ from vacancies.main.models import VacancyFeatures
client = QdrantClient(url="http://localhost:6333") client = QdrantClient(url="http://localhost:6333")
FEATURE_NAMES = [ FEATURE_NAMES = [
"employment_type", "work_format", "experience", "position_level", "industry", "tech_stack", "job_title", "employment_type", "work_format", "experience", "position_level", "industry", "tech_stack",
"location", "salary_range", "languages", "education", "schedule", "additional_requirements" "location", "salary_range", "languages", "education", "schedule", "additional_requirements"
] ]
weights = { weights = {
"job_title": 10,
"employment_type": 2, "employment_type": 2,
"work_format": 2, "work_format": 2,
"experience": 4, "experience": 3,
"position_level": 4, "position_level": 5,
"industry": 4, "industry": 4,
"tech_stack": 5, "tech_stack": 5,
"location": 2, "location": 2,
@ -87,35 +88,40 @@ def search_similarities(query_filter: Filter, cv_id: int) -> list[dict]:
)[0].vector )[0].vector
max_similarities = {} max_similarities = {}
vacancies_content = {}
for name, vec in vectors.items(): for name, vec in vectors.items():
if any(v != 0 for v in vec): if any(v != 0 for v in vec):
results = client.search( results = client.query_points(
collection_name="vacancies", collection_name="vacancies",
query_vector=(name, vec), query=vec,
using=name,
limit=1000, limit=1000,
with_payload=True, with_payload=True,
query_filter=query_filter, query_filter=query_filter,
) )
for res in results: for res in results.points:
vid = res.id vid = res.id
sim = res.score sim = res.score
if vid not in max_similarities: if vid not in max_similarities:
max_similarities[vid] = {} max_similarities[vid] = {}
max_similarities[vid][name] = sim max_similarities[vid][name] = sim
if vid not in vacancies_content:
vacancies_content[vid] = res.payload["content"]
scored = [] scored = []
for vid, feature_sims in max_similarities.items(): for vid, feature_sims in max_similarities.items():
total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims) total = sum(feature_sims[feature] * weights.get(feature, 1) for feature in feature_sims)
scored.append({"id": vid, "score": total}) scored.append({"id": vid, "score": total, "content": vacancies_content[vid]})
scored.sort(key=lambda x: x["score"], reverse=True) scored.sort(key=lambda x: x["score"], reverse=True)
return scored[0]["id"] return scored[0]["id"], scored[0]["content"]
def extract_features(content: str) -> VacancyFeatures: def extract_features(content: str) -> VacancyFeatures:
prompt = f""" prompt = f"""
Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null. Extract the following features from the job vacancy description. If a feature is not mentioned, set it to null.
Features: Features:
- job_title: Должность (e.g., DevOps, Python программист)
- employment_type: Тип занятости (e.g., Полная занятость, Частичная) - employment_type: Тип занятости (e.g., Полная занятость, Частичная)
- work_format: Формат работы (e.g., Офис, Удалённо, Гибрид) - work_format: Формат работы (e.g., Офис, Удалённо, Гибрид)
- experience: Опыт работы (e.g., 3-5 лет, Нет опыта) - experience: Опыт работы (e.g., 3-5 лет, Нет опыта)