62 lines
2.6 KiB
Python
62 lines
2.6 KiB
Python
from django.core.management import BaseCommand
|
||
import clickhouse_connect
|
||
from vacancies.main.vector_store import add_vectors, extract_features, qdrant_client
|
||
from vacancies.conf.settings import CLICKHOUSE_HOST, CLICKHOUSE_PORT
|
||
|
||
# Module-level ClickHouse client, created once at import time and reused by the
# command below. Host and port are taken from the project settings module.
clickhouse_client = clickhouse_connect.create_client(
    host=CLICKHOUSE_HOST,
    port=CLICKHOUSE_PORT,
)
|
||
|
||
# ClickHouse query that picks Telegram messages which look like vacancy posts.
#
# A row is selected when all of the following hold:
#   * it was posted within the last 30 days;
#   * length(message) is greater than 150;
#   * at least 5 entries of the vacancy keyword list occur in the message;
#   * none of the "about me" markers (typical for job-seeker posts) occur in it;
#   * its id is not in the `exist_points` parameter (ids that are already stored).
#
# `exist_points` is bound client-side through the `%(exist_points)s` placeholder
# and must be a non-empty tuple, otherwise `NOT IN ()` is not valid SQL.
#
# The exclusion filter uses lowerUTF8(): ClickHouse `lower()` only lowercases
# ASCII letters, so with `lower()` a capitalised Cyrillic marker such as
# "О себе" would never match the lowercase needle.
#
# NOTE(review): the positive keyword filter matches case-sensitively against the
# raw message, so capitalised headings ("Требования", "Senior") are not counted.
# Left as-is because the ">= 5" threshold may be tuned to that behaviour; confirm
# whether it should use lowerUTF8(message) as well.
query = """
SELECT id, chat_username, telegram_id, message, timestamp
FROM telegram_parser_chatmessage
WHERE timestamp >= now() - INTERVAL 30 DAY
  AND length(message) > 150
  AND arrayCount(x -> position(message, x) > 0, [
    'ваканси', 'ищем', 'требуется', 'разработчик', 'будет плюсом',
    'зарплат', 'оклад', 'з/п', 'руб', 'опыт',
    'требовани', 'обязанности', 'условия', 'офис',
    'удаленн', 'гибкий график', 'полный день', 'занятост',
    'резюме', 'собеседовани', 'junior', 'middle', 'senior', 'ждем', 'компани',
    'заниматься', 'формат', 'занятость', 'вилка', 'должност', 'контакт'
  ]) >= 5
  AND arrayCount(x -> position(lowerUTF8(message), x) > 0, [
    'о себе', 'обо мне', 'умею', '#ищу'
  ]) = 0
  AND id NOT IN %(exist_points)s
"""
|
||
|
||
|
||
class Command(BaseCommand):
    """Management command that feeds new vacancy-like Telegram messages to the vector store.

    The command reads the ids of all points already present in the Qdrant
    collection, asks ClickHouse for matching messages whose ids are not among
    them, runs ``extract_features`` on every message and hands the result to
    ``add_vectors``.
    """

    help = "Collect vacancies from telegram messages"

    # Collection that is scrolled for existing ids and passed to add_vectors.
    COLLECTION_NAME = "vacancies"

    def handle(self, *args, **options):
        """Fetch not-yet-stored messages from ClickHouse and add each one to the collection.

        Progress is written to ``self.stdout`` (one line per message) so that the
        output can be captured or redirected by Django, e.g. via
        ``call_command(..., stdout=...)``.
        """
        exist_points = self._existing_point_ids()

        result_rows = clickhouse_client.query(
            query,
            parameters={"exist_points": exist_points},
        ).result_rows
        total = len(result_rows)

        for position, row in enumerate(result_rows, start=1):
            # Column order follows the SELECT list of the module-level query.
            message_id, chat_username, telegram_id, message, timestamp = row

            link = f"https://t.me/{chat_username}/{telegram_id}"
            self.stdout.write(f"Processing {position}/{total} link: {link}")

            features = extract_features(message)

            # model_dump() is called separately for each argument so the two
            # dicts handed to add_vectors are independent objects.
            add_vectors(
                self.COLLECTION_NAME,
                message_id,
                features.model_dump(),
                {
                    'content': message,
                    'features_json': features.model_dump(),
                    "link": link,
                    "timestamp": timestamp,
                },
            )

    def _existing_point_ids(self) -> tuple:
        """Return the ids of all points in the collection as a de-duplicated tuple.

        The result always contains the sentinel ``-1`` so that it is never
        empty: the tuple is substituted into ``id NOT IN (...)`` in the
        ClickHouse query, and an empty list there would be invalid SQL.
        """
        point_ids = {-1}
        next_page_offset = 0
        # scroll() returns (points, next_offset); next_offset is None on the last page.
        while next_page_offset is not None:
            points, next_page_offset = qdrant_client.scroll(
                collection_name=self.COLLECTION_NAME,
                limit=100_000,
                offset=next_page_offset,
                with_payload=False,
                with_vectors=False,
                timeout=30,
            )
            point_ids.update(point.id for point in points)
        return tuple(point_ids)
|