Source code for labtools.slack.load

import json
import os
import pandas as pd
from tqdm import tqdm
from typing import Union, Optional
from datetime import datetime
from pathlib import Path
import requests


def clean_messages(messages: dict) -> pd.DataFrame:
    """
    Clean the raw json messages from a slack export

    Every message is reduced to its channel, timestamp, author names,
    downloaded attachment names and text.

    Args:
        messages (dict): dictionary of {'channel': [messages]} from :func:`.load_messages`

    Returns:
        :class:`pandas.DataFrame` : cleaned dataframe of messages, one row per
        message, with the columns ``channel``, ``timestamp``, ``display_name``,
        ``real_name``, ``files`` and ``text``. The columns are present even
        when there are no messages at all.

    Raises:
        KeyError: if a message has no ``'ts'`` or no ``'text'`` entry
    """
    columns = ['channel', 'timestamp', 'display_name', 'real_name', 'files', 'text']
    rows = []

    # iter through each channel and its list of messages
    for channel, channel_messages in messages.items():
        for message in channel_messages:
            # some message types don't have a user profile attached.
            # Look both names up independently so that a profile carrying only
            # one of them does not lose the one it does have.
            profile = message.get('user_profile') or {}

            # simplify the message
            rows.append({
                'channel': channel,
                # 'ts' is seconds since the epoch; without a tz argument
                # fromtimestamp() yields a naive datetime in local time
                'timestamp': datetime.fromtimestamp(float(message['ts'])),
                'display_name': profile.get('display_name'),
                'real_name': profile.get('real_name'),
                # 'local_files' is the key written by download_attachments
                'files': message.get('local_files', []),
                'text': message['text'],
            })

    # passing the columns explicitly keeps the schema stable for empty input
    return pd.DataFrame(rows, columns=columns)
def _download_file(url: str, file_path: Path, progress=None, block_size: int = 1024) -> None:
    """
    Stream `url` into `file_path`.

    The data is written to a ``.part`` file next to `file_path` and only
    renamed once the transfer has finished, so an interrupted download is
    never mistaken for a finished one on the next run.

    Args:
        url (str): address to download
        file_path (:class:`pathlib.Path`): final location of the file
        progress: optional progress bar offering ``reset(total=...)`` and
            ``update(n)``; nothing is reported when None
        block_size (int): size in bytes of the chunks that are read and written
    """
    partial_path = file_path.with_name(file_path.name + '.part')
    try:
        # NOTE(review): the HTTP status is not checked, so an error page would
        # be stored as the attachment -- consider req.raise_for_status()
        with requests.get(url, stream=True) as req:
            if progress is not None:
                total_size_in_bytes = int(req.headers.get('content-length', 0))
                progress.reset(total=total_size_in_bytes)
            with open(partial_path, 'wb') as open_file:
                for data in req.iter_content(block_size):
                    open_file.write(data)
                    if progress is not None:
                        progress.update(len(data))
        partial_path.replace(file_path)
    finally:
        # only still present when the transfer failed part way through
        if partial_path.exists():
            partial_path.unlink()


def download_attachments(messages: dict, base_dir: Path, download_subdir: str = 'files', verbose: bool = True) -> dict:
    """
    Download all linked attachments and save them in `download_subdir` beneath `base_dir`.

    Files are stored as ``<file id>_<file name>``. A file that is already
    present is not downloaded again. The message dicts are updated in place:
    every message with at least one downloadable attachment gets a
    ``'local_files'`` list holding the stored file names.

    Args:
        messages (dict): Dict of (uncleaned) messages from :func:`.load_messages`
        base_dir (:class:`pathlib.Path`): base directory, beneath which to save the files
        download_subdir (str): subdirectory beneath `base_dir` into which the files should be downloaded.
        verbose (bool): if True, show progress bars and messages

    Returns:
        dict: dict like input but with a list of local files
    """
    if verbose:
        print('Downloading attachments...')

    download_dir = Path(base_dir) / download_subdir
    download_dir.mkdir(parents=True, exist_ok=True)

    # the progress bars only exist in verbose mode; None means "don't report",
    # which keeps the download path free of names that are bound conditionally
    msg_pbar = None
    download_pbar = None
    if verbose:
        n_messages = sum(len(msgs) for msgs in messages.values())
        msg_pbar = tqdm(total=n_messages, position=0)
        download_pbar = tqdm(total=0, unit='iB', unit_scale=True, position=1)

    try:
        for channel_messages in messages.values():
            for message in channel_messages:
                local_files = []
                for attachment in message.get('files', []):
                    # entries without a download link cannot be fetched
                    if 'url_private_download' not in attachment:
                        continue

                    file_name = f"{attachment['id']}_{attachment['name']}"
                    file_path = download_dir / file_name
                    local_files.append(file_name)

                    # skip anything a previous run has already downloaded
                    if not file_path.exists():
                        _download_file(
                            attachment['url_private_download'],
                            file_path,
                            progress=download_pbar,
                        )

                if local_files:
                    message['local_files'] = local_files

                if msg_pbar is not None:
                    msg_pbar.update()
    finally:
        # close the bars even when a download raised
        if msg_pbar is not None:
            msg_pbar.close()
        if download_pbar is not None:
            download_pbar.close()

    return messages
def load_messages(base_dir: str, clean: bool = True, download: bool = True) -> Union[dict, pd.DataFrame]:
    """
    Load slack .json messages from an export, optionally cleaning and downloading attachments.

    Every directory directly beneath `base_dir` is treated as a channel, and
    the contents of all ``.json`` files inside it are concatenated into that
    channel's list of messages. Channels and files are visited in sorted name
    order so the result does not depend on the order in which the file system
    lists them.

    Arguments:
        base_dir (str): base directory of slack export
        clean (bool): if True (default), also :func:`.clean_messages` before returning
        download (bool): if True (default), download attached files with :func:`.download_attachments`

    Returns:
        (dict, :class:`pandas.DataFrame`): dict of {'channel': [messages]}, or
        the dataframe made from it by :func:`.clean_messages` when `clean` is True
    """
    messages = {}

    # iterate through folders in base directory
    # NOTE(review): download_attachments stores files in a 'files' folder
    # beneath base_dir by default, so on a later run that folder is picked up
    # here as if it were a channel -- confirm whether it should be skipped
    for subdir in sorted(os.listdir(base_dir)):
        subdir_complete = os.path.join(base_dir, subdir)
        if not os.path.isdir(subdir_complete):
            continue

        # a folder contains .json files of messages for each day;
        # presumably they are named by date, in which case sorting by name
        # also puts the messages in chronological order -- TODO confirm
        channel_messages = []
        for json_file in sorted(os.listdir(subdir_complete)):
            if not json_file.endswith('.json'):
                continue
            # JSON text is UTF-8; don't rely on the platform's default encoding
            with open(os.path.join(subdir_complete, json_file), 'r', encoding='utf-8') as jfile:
                channel_messages.extend(json.load(jfile))

        messages[subdir] = channel_messages

    if download:
        messages = download_attachments(messages, Path(base_dir).resolve())

    if clean:
        messages = clean_messages(messages)

    return messages