vision-career/vacancies/main/management/commands/collect_vacancies_from_telegram_messages.py

69 lines
2.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from django.core.management import BaseCommand
from vacancies.main.models import Vacancy
import clickhouse_connect
from vacancies.main.vector_store import add_vectors, extract_features, client as qdrant
# ClickHouse connection used by Command.handle to fetch candidate messages.
clickhouse_client = clickhouse_connect.create_client(host="127.0.0.1", port=18123)

# Collect the ids of every point already stored in the Qdrant "vacancies"
# collection. Payloads and vectors are not requested; only ids are needed.
next_page_offset = 0
exist_points_ids = []
while next_page_offset is not None:
    response = qdrant.scroll(
        collection_name="vacancies",
        limit=100_000,
        offset=next_page_offset,
        with_payload=False,
        with_vectors=False,
        timeout=30,
    )
    # response[0] is the page of points, response[1] the offset of the next
    # page; the loop stops once that offset is None.
    exist_points_ids.extend([point.id for point in response[0]])
    next_page_offset = response[1]

exist_points_set = tuple(exist_points_ids)
print("qdrant vacancies points count:", len(exist_points_set))

# Build the exclusion clause explicitly instead of interpolating the tuple's
# repr: an empty tuple renders as "()" and a one-element tuple as "(5,)",
# neither of which is a dependable SQL IN list. With no known ids the clause
# is dropped entirely.
# NOTE(review): Command.handle stores points under Vacancy ids, while this
# clause compares them with telegram_parser_chatmessage.id -- confirm both
# share one id space.
# NOTE(review): a very large id list makes the query text large; confirm it
# stays within the server's query size limit.
if exist_points_set:
    _existing_ids_sql = ", ".join(repr(point_id) for point_id in exist_points_set)
    _exclude_existing_sql = f"AND id NOT IN ({_existing_ids_sql})"
else:
    _exclude_existing_sql = ""

# Selects messages from the last 30 days whose length() exceeds 150, that
# contain at least 5 of the listed keywords, and that contain none of the
# markers 'О себе', 'Обо мне' or '#ищу'.
query = f"""
SELECT id, chat_username, telegram_id, message, timestamp
FROM telegram_parser_chatmessage
WHERE timestamp >= now() - INTERVAL 30 DAY
AND length(message) > 150
AND arrayCount(x -> position(message, x) > 0, [
'вакансия', 'ищем', 'требуется', 'разработчик', 'будет плюсом',
'зарплата', 'оклад', 'з/п', 'руб', 'опыт работы',
'требования', 'обязанности', 'условия', 'компания', 'офис',
'удаленно', 'гибкий график', 'полный день', 'частичная занятость',
'резюме', 'собеседование', 'junior', 'middle', 'senior'
]) >= 5 AND position(message, 'О себе') = 0 AND position(message, 'Обо мне') = 0 AND position(message, '#ищу') = 0
{_exclude_existing_sql}
"""
class Command(BaseCommand):
    """Create vacancies and their vectors from matching Telegram messages.

    Runs the module-level ClickHouse ``query`` and, for every returned
    message, extracts features, gets or creates the ``Vacancy`` for the
    message link and writes a point to the "vacancies" vector collection.
    """

    help = "Collect vacancies from telegram messages"

    def handle(self, *args, **options):
        """Process every row returned by ``query`` and report timings.

        Positional and keyword arguments are accepted for the management
        command interface but are not used.
        """
        import time

        # perf_counter() is monotonic, so the reported durations cannot be
        # skewed by system clock adjustments.
        started = time.perf_counter()
        result_rows = clickhouse_client.query(query).result_rows
        self.stdout.write(f"query time: {time.perf_counter() - started:.4f}")

        result_rows_len = len(result_rows)
        # Count from 1 so the progress line ends at N/N rather than N-1/N.
        for index, row in enumerate(result_rows, start=1):
            # The row id and timestamp are selected by the query but unused.
            _message_id, chat_username, telegram_id, message, _timestamp = row
            link = f"https://t.me/{chat_username}/{telegram_id}"
            self.stdout.write(f"Processing {index}/{result_rows_len} link: {link}")

            started = time.perf_counter()
            features = extract_features(message)
            self.stdout.write(f"ai time: {time.perf_counter() - started:.4f}")

            # The link identifies the vacancy; an existing row is reused.
            vacancy, _created = Vacancy.objects.get_or_create(link=link)

            started = time.perf_counter()
            # model_dump() is called separately for each argument so that
            # add_vectors never receives the same dict object twice.
            add_vectors(
                "vacancies",
                vacancy.id,
                features.model_dump(),
                {'content': message, 'features_json': features.model_dump()},
            )
            self.stdout.write(
                f"write vector time: {time.perf_counter() - started:.4f}"
            )