Subreddit Clustering Techniques for Community Analysis
Discover hidden community relationships through algorithmic clustering
Reddit's 100,000+ active subreddits form complex networks of overlapping communities. Clustering algorithms reveal these hidden relationships, enabling researchers to discover related communities, identify audience segments, and understand the broader Reddit ecosystem.
Common applications include market research (finding adjacent audiences), content strategy (discovering crossover topics), competitive analysis (identifying communities where a brand is discussed), and academic research (mapping online communities).
Data Sources for Clustering
Effective subreddit clustering requires diverse data signals. Each source provides a different perspective on community relationships, and in practice several signals are often blended into a single similarity score (see the sketch after the table).
| Data Source | Signal Type | Pros | Cons |
|---|---|---|---|
| User Activity Overlap | Behavioral | Strong signal; captures shared interests | Requires user data collection |
| Cross-post Patterns | Behavioral | Direct community link | Sparse for smaller subreddits |
| Content Similarity | Topical | Topic-based clustering | May miss behavioral connections |
| Subreddit Descriptions | Metadata | Easy to collect | Often incomplete |
| Moderator Overlap | Organizational | Administrative relationships | Not always meaningful |
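Below is a minimal sketch of such a weighted blend. The signal names in `SIGNAL_WEIGHTS` and the weights themselves are illustrative assumptions, not a prescribed recipe, and `combined_similarity` is a hypothetical helper:

```python
from typing import Dict, Tuple

# Illustrative weights for blending similarity signals; tune per use case.
SIGNAL_WEIGHTS = {'user_overlap': 0.5, 'content': 0.3, 'crosspost': 0.2}


def combined_similarity(
    pair: Tuple[str, str],
    signals: Dict[str, Dict[Tuple[str, str], float]]
) -> float:
    """Weighted average of the available signals for a subreddit pair.

    `signals` maps a signal name to {(sub_a, sub_b): similarity in [0, 1]}.
    Pairs missing from a signal simply contribute nothing to it.
    """
    score = 0.0
    total_weight = 0.0
    for name, weight in SIGNAL_WEIGHTS.items():
        value = signals.get(name, {}).get(pair)
        if value is not None:
            score += weight * value
            total_weight += weight
    return score / total_weight if total_weight > 0 else 0.0
```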
User Overlap Clustering
User overlap measures how many users participate in multiple subreddits. Subreddits with high user overlap likely share audience interests and can be clustered together. A standard measure is Jaccard similarity: if two subreddits share 300 participants out of 1,000 distinct participants across both, their similarity is 300 / 1,000 = 0.3.
```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from typing import Dict, List, Set, Tuple
from collections import defaultdict


class UserOverlapClusterer:
    """
    Cluster subreddits based on user participation overlap.
    Uses Jaccard similarity to measure community overlap.
    """

    def __init__(self, min_users: int = 100):
        self.min_users = min_users
        self.subreddit_users: Dict[str, Set[str]] = defaultdict(set)

    def add_user_activity(self, user_id: str, subreddits: List[str]):
        """Record user participation in subreddits."""
        for subreddit in subreddits:
            self.subreddit_users[subreddit.lower()].add(user_id)

    def build_from_posts(self, posts: List[Dict]):
        """Build the user-subreddit mapping from post data."""
        for post in posts:
            user = post.get('author')
            subreddit = post.get('subreddit')
            if user and subreddit and user != '[deleted]':
                self.subreddit_users[subreddit.lower()].add(user)

    def jaccard_similarity(self, set_a: Set[str], set_b: Set[str]) -> float:
        """Calculate Jaccard similarity between two sets."""
        if not set_a or not set_b:
            return 0.0
        intersection = len(set_a & set_b)
        union = len(set_a | set_b)
        return intersection / union if union > 0 else 0.0

    def build_similarity_matrix(self) -> Tuple[np.ndarray, List[str]]:
        """Build a pairwise similarity matrix for all subreddits."""
        # Keep only subreddits with enough users for a stable signal
        valid_subs = [
            sub for sub, users in self.subreddit_users.items()
            if len(users) >= self.min_users
        ]

        n = len(valid_subs)
        similarity = np.zeros((n, n))

        # The matrix is symmetric, so only compute the upper triangle
        for i in range(n):
            for j in range(i, n):
                if i == j:
                    similarity[i, j] = 1.0
                else:
                    sim = self.jaccard_similarity(
                        self.subreddit_users[valid_subs[i]],
                        self.subreddit_users[valid_subs[j]]
                    )
                    similarity[i, j] = sim
                    similarity[j, i] = sim

        return similarity, valid_subs

    def cluster(self, n_clusters: int = 10, linkage: str = 'average') -> Dict[str, int]:
        """
        Cluster subreddits using hierarchical clustering.
        Returns a mapping of subreddit to cluster ID.
        """
        similarity, subreddits = self.build_similarity_matrix()

        # Convert similarity to distance for the precomputed metric
        distance = 1 - similarity

        clustering = AgglomerativeClustering(
            n_clusters=n_clusters,
            metric='precomputed',
            linkage=linkage
        )
        labels = clustering.fit_predict(distance)

        return {sub: label for sub, label in zip(subreddits, labels)}

    def find_similar(self, subreddit: str, top_n: int = 10) -> List[Tuple[str, float]]:
        """Find the most similar subreddits to a given one."""
        subreddit = subreddit.lower()
        if subreddit not in self.subreddit_users:
            return []

        target_users = self.subreddit_users[subreddit]
        similarities = []

        for sub, users in self.subreddit_users.items():
            if sub != subreddit and len(users) >= self.min_users:
                sim = self.jaccard_similarity(target_users, users)
                similarities.append((sub, sim))

        # Sort by similarity descending
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:top_n]


# Usage (assumes `posts` is an iterable of dicts with 'author' and 'subreddit')
clusterer = UserOverlapClusterer(min_users=50)
for post in posts:
    clusterer.add_user_activity(post['author'], [post['subreddit']])

# Find similar subreddits
similar = clusterer.find_similar('python', top_n=10)
for sub, sim in similar:
    print(f"r/{sub}: {sim:.3f}")

# Cluster all subreddits
clusters = clusterer.cluster(n_clusters=15)
```
Content-Based Clustering
Content-based clustering uses the text of posts and comments to determine subreddit similarity. Subreddits discussing similar topics cluster together regardless of user overlap.
```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, List, Tuple
import numpy as np


class ContentBasedClusterer:
    """
    Cluster subreddits based on post content similarity.
    Aggregates posts per subreddit and compares TF-IDF vectors.
    """

    def __init__(self, max_features: int = 5000, min_posts: int = 100):
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            stop_words='english',
            ngram_range=(1, 2),
            min_df=5,
            max_df=0.95
        )
        self.min_posts = min_posts
        self.subreddit_content: Dict[str, List[str]] = {}

    def add_post(self, subreddit: str, text: str):
        """Add post content for a subreddit."""
        subreddit = subreddit.lower()
        if subreddit not in self.subreddit_content:
            self.subreddit_content[subreddit] = []
        self.subreddit_content[subreddit].append(text)

    def build_vectors(self):
        """Build TF-IDF vectors (sparse matrix) and the subreddit list."""
        # Filter by minimum posts
        valid_subs = [
            sub for sub, posts in self.subreddit_content.items()
            if len(posts) >= self.min_posts
        ]

        # Aggregate content per subreddit into one document each
        documents = [
            ' '.join(self.subreddit_content[sub])
            for sub in valid_subs
        ]

        # Fit and transform
        vectors = self.vectorizer.fit_transform(documents)
        return vectors, valid_subs

    def cluster_kmeans(self, n_clusters: int = 10) -> Dict[str, int]:
        """Cluster subreddits using K-Means."""
        vectors, subreddits = self.build_vectors()

        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        labels = kmeans.fit_predict(vectors)

        return {sub: label for sub, label in zip(subreddits, labels)}

    def find_similar(self, subreddit: str, top_n: int = 10) -> List[Tuple[str, float]]:
        """Find content-similar subreddits."""
        vectors, subreddits = self.build_vectors()
        subreddit = subreddit.lower()

        if subreddit not in subreddits:
            return []

        idx = subreddits.index(subreddit)
        target_vector = vectors[idx]

        # Calculate cosine similarities against all subreddits
        similarities = cosine_similarity(target_vector, vectors)[0]

        # Get top similar (skipping index 0, which is self)
        similar_indices = similarities.argsort()[::-1][1:top_n + 1]
        return [(subreddits[i], similarities[i]) for i in similar_indices]

    def get_cluster_keywords(self, clusters: Dict[str, int], top_n: int = 10) -> Dict[int, List[str]]:
        """Extract top keywords for each cluster."""
        vectors, subreddits = self.build_vectors()
        feature_names = self.vectorizer.get_feature_names_out()

        cluster_keywords = {}
        for cluster_id in set(clusters.values()):
            # Get subreddits in this cluster
            cluster_subs = [
                sub for sub, cid in clusters.items()
                if cid == cluster_id and sub in subreddits
            ]
            if not cluster_subs:
                continue

            # Average vectors in cluster
            indices = [subreddits.index(s) for s in cluster_subs]
            cluster_vector = vectors[indices].mean(axis=0)
            cluster_vector = np.asarray(cluster_vector).flatten()

            # Top keywords by mean TF-IDF weight
            top_indices = cluster_vector.argsort()[-top_n:][::-1]
            cluster_keywords[cluster_id] = [feature_names[i] for i in top_indices]

        return cluster_keywords
```
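A short usage sketch for the content-based approach; the `posts` iterable (dicts with `subreddit` and `title` keys) is an assumed input, as in the user-overlap example:

```python
# Hypothetical input: `posts` yields dicts with 'subreddit' and 'title' keys
content_clusterer = ContentBasedClusterer(min_posts=50)
for post in posts:
    content_clusterer.add_post(post['subreddit'], post['title'])

# Cluster and label each cluster with its top keywords
clusters = content_clusterer.cluster_kmeans(n_clusters=10)
keywords = content_clusterer.get_cluster_keywords(clusters, top_n=5)
for cluster_id, words in sorted(keywords.items()):
    print(f"Cluster {cluster_id}: {', '.join(words)}")
```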
Embedding-Based Clustering
Neural embeddings capture semantic meaning better than TF-IDF. By embedding subreddit descriptions or aggregated content, we can find communities with similar themes even when they use different vocabulary.
```python
from sentence_transformers import SentenceTransformer
from sklearn.cluster import HDBSCAN  # requires scikit-learn >= 1.3
from sklearn.manifold import TSNE
import numpy as np
from typing import Dict, List, Optional, Tuple


class EmbeddingClusterer:
    """
    Cluster subreddits using neural embeddings.
    Uses sentence transformers for semantic similarity.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)
        self.subreddit_data: Dict[str, str] = {}
        self.embeddings: Optional[np.ndarray] = None
        self.subreddits: List[str] = []

    def add_subreddit(self, name: str, description: str,
                      sample_posts: Optional[List[str]] = None):
        """Add subreddit with description and optional sample posts."""
        # Combine description with sample post titles
        content = description
        if sample_posts:
            content += " " + " ".join(sample_posts[:20])
        self.subreddit_data[name.lower()] = content

    def compute_embeddings(self):
        """Compute L2-normalized embeddings for all subreddits."""
        self.subreddits = list(self.subreddit_data.keys())
        texts = [self.subreddit_data[s] for s in self.subreddits]

        # Normalizing makes the dot product equal cosine similarity and
        # lets euclidean-based clustering approximate cosine distance.
        self.embeddings = self.model.encode(
            texts,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=True
        )

    def cluster_hdbscan(self, min_cluster_size: int = 5,
                        min_samples: int = 3) -> Dict[str, int]:
        """
        Cluster using HDBSCAN (density-based).
        Advantages: no need to specify cluster count, handles noise
        points (labeled -1), finds clusters of varying density.
        """
        if self.embeddings is None:
            self.compute_embeddings()

        # Euclidean distance on unit vectors is monotonic in cosine
        # distance, so this approximates cosine-based clustering.
        clusterer = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric='euclidean'
        )
        labels = clusterer.fit_predict(self.embeddings)

        return {sub: label for sub, label in zip(self.subreddits, labels)}

    def find_similar(self, subreddit: str, top_n: int = 10) -> List[Tuple[str, float]]:
        """Find semantically similar subreddits."""
        if self.embeddings is None:
            self.compute_embeddings()

        subreddit = subreddit.lower()
        if subreddit not in self.subreddits:
            return []

        idx = self.subreddits.index(subreddit)
        target_embedding = self.embeddings[idx]

        # Embeddings are unit-normalized, so the dot product is the
        # cosine similarity.
        similarities = self.embeddings @ target_embedding

        # Sort and return top (excluding self)
        sorted_indices = similarities.argsort()[::-1]
        results = []
        for i in sorted_indices:
            if self.subreddits[i] != subreddit:
                results.append((self.subreddits[i], float(similarities[i])))
                if len(results) >= top_n:
                    break
        return results

    def get_2d_projection(self) -> Dict[str, Tuple[float, float]]:
        """Get a 2D t-SNE projection for visualization."""
        if self.embeddings is None:
            self.compute_embeddings()

        # Perplexity must be smaller than the number of subreddits
        tsne = TSNE(n_components=2, perplexity=30, random_state=42)
        coords = tsne.fit_transform(self.embeddings)

        return {
            sub: (coords[i, 0], coords[i, 1])
            for i, sub in enumerate(self.subreddits)
        }


# Usage (toy example; real clustering needs many more subreddits)
clusterer = EmbeddingClusterer()

# Add subreddits with descriptions
clusterer.add_subreddit(
    "python",
    "News about the Python programming language",
    ["How to use decorators", "Best Python IDE"]
)
clusterer.add_subreddit(
    "javascript",
    "All about the JavaScript language",
    ["React vs Vue", "Node.js tutorial"]
)

# Cluster
clusters = clusterer.cluster_hdbscan(min_cluster_size=3)

# Find similar
similar = clusterer.find_similar("python", top_n=5)
```
Discover Related Communities Instantly
reddapi.dev uses semantic search to find discussions across related subreddits. No clustering setup required.
Graph-Based Clustering
Model subreddits as nodes in a graph where edges represent relationships (user overlap, cross-posts, etc.). Graph clustering algorithms find tightly connected communities.
```python
import networkx as nx
from community import community_louvain  # python-louvain package
from typing import Dict, List, Tuple


class GraphClusterer:
    """
    Cluster subreddits using graph community detection.
    Uses the Louvain algorithm for modularity optimization.
    """

    def __init__(self):
        self.graph = nx.Graph()

    def add_edge(self, sub1: str, sub2: str, weight: float = 1.0):
        """Add an edge between subreddits, accumulating weight."""
        sub1, sub2 = sub1.lower(), sub2.lower()
        if self.graph.has_edge(sub1, sub2):
            self.graph[sub1][sub2]['weight'] += weight
        else:
            self.graph.add_edge(sub1, sub2, weight=weight)

    def build_from_crosspost(self, crossposts: List[Dict]):
        """Build the graph from cross-post data."""
        for cp in crossposts:
            source = cp.get('source_subreddit')
            target = cp.get('target_subreddit')
            if source and target:
                self.add_edge(source, target)

    def build_from_overlap(self, similarity_matrix: Dict[Tuple[str, str], float],
                           threshold: float = 0.05):
        """Build the graph from a similarity matrix, keeping edges above threshold."""
        for (sub1, sub2), sim in similarity_matrix.items():
            if sim >= threshold:
                self.add_edge(sub1, sub2, weight=sim)

    def cluster_louvain(self, resolution: float = 1.0) -> Dict[str, int]:
        """
        Detect communities using the Louvain algorithm.

        Args:
            resolution: higher values yield more, smaller clusters
        """
        partition = community_louvain.best_partition(
            self.graph,
            weight='weight',
            resolution=resolution
        )
        return partition

    def get_cluster_stats(self, clusters: Dict[str, int]) -> Dict[int, Dict]:
        """Get statistics for each cluster."""
        stats = {}
        for cluster_id in set(clusters.values()):
            members = [s for s, c in clusters.items() if c == cluster_id]

            # Create subgraph for cluster
            subgraph = self.graph.subgraph(members)

            stats[cluster_id] = {
                'size': len(members),
                'members': members,
                'density': nx.density(subgraph),
                'avg_degree': sum(dict(subgraph.degree()).values()) / len(members)
            }
        return stats

    def get_bridge_subreddits(self, clusters: Dict[str, int],
                              top_n: int = 10) -> List[Tuple[str, float]]:
        """Find subreddits that bridge multiple clusters."""
        # Note: networkx treats edge weights as distances here, so consider
        # inverting similarity weights if precise rankings matter.
        betweenness = nx.betweenness_centrality(self.graph, weight='weight')

        # Sort by betweenness descending
        sorted_subs = sorted(betweenness.items(), key=lambda x: x[1], reverse=True)
        return sorted_subs[:top_n]
```
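A brief usage sketch; the `crossposts` list (dicts with `source_subreddit` and `target_subreddit` keys) is an assumed input:

```python
# Hypothetical input: `crossposts` is a list of dicts with
# 'source_subreddit' and 'target_subreddit' keys.
graph_clusterer = GraphClusterer()
graph_clusterer.build_from_crosspost(crossposts)

clusters = graph_clusterer.cluster_louvain(resolution=1.0)
stats = graph_clusterer.get_cluster_stats(clusters)
for cluster_id, info in sorted(stats.items()):
    print(f"Cluster {cluster_id}: {info['size']} members, "
          f"density {info['density']:.2f}")

# Subreddits that connect otherwise separate communities
bridges = graph_clusterer.get_bridge_subreddits(clusters, top_n=5)
```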
Clustering Evaluation
Evaluate clustering quality using both internal metrics (which require no ground truth) and external validation when labeled data is available. The sketch after the table shows how each metric can be computed.
| Metric | Type | Interpretation | Best Value |
|---|---|---|---|
| Silhouette Score | Internal | Cluster cohesion vs separation | Close to 1 |
| Davies-Bouldin Index | Internal | Cluster similarity ratio | Close to 0 |
| Modularity | Internal (Graph) | Community structure quality | 0.3 - 0.7 |
| Adjusted Rand Index | External | Agreement with ground truth | Close to 1 |
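A sketch of how these metrics might be computed with scikit-learn and python-louvain; `evaluate_clustering` is a hypothetical helper, and it assumes you already have feature vectors, predicted labels, and (optionally) a weighted graph with its partition and ground-truth labels:

```python
import networkx as nx
from community import community_louvain  # python-louvain, as above
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    adjusted_rand_score,
)


def evaluate_clustering(vectors, labels, graph=None, partition=None,
                        true_labels=None):
    """Compute internal metrics, plus external ones when available.

    vectors: array of features used for clustering
    labels: predicted cluster label per row of `vectors` (>= 2 clusters)
    graph/partition: weighted nx.Graph and {node: community} for modularity
    true_labels: optional ground-truth labels aligned with `labels`
    """
    results = {
        # Cohesion vs. separation; higher is better, range [-1, 1]
        'silhouette': silhouette_score(vectors, labels),
        # Cluster similarity ratio; lower is better, 0 is best
        'davies_bouldin': davies_bouldin_score(vectors, labels),
    }
    if graph is not None and partition is not None:
        # Strong community structure typically scores 0.3-0.7
        results['modularity'] = community_louvain.modularity(
            partition, graph, weight='weight'
        )
    if true_labels is not None:
        # 1.0 = perfect agreement with ground truth, ~0 = random
        results['adjusted_rand'] = adjusted_rand_score(true_labels, labels)
    return results
```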