require 'fileutils'
require 'csv'
require File.expand_path(File.dirname(__FILE__) + "/mbox.rb")
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
=begin
PREREQUISITES and INFORMATION:
COOKIES:
in order to be able to extract users' email addresses correctly from the Google Group, you will need
to have access to a Manager account of the Google Group. Having logged into Google Groups with this Manager account,
export the cookies.txt from your browser (I used this Chrome extension to get the cookies.txt file:
https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg)
(Without this step, the email addresses cannot be harvested from the Google Group, and this will mess up creation
of new users on Discourse)
Once you have the cookies.txt file, the easiest way to get it into your Docker container is to upload it as an attachment
to a post in your discourse forum. Copy the attachment's relative URL from the post
(something like '/uploads/default/original/1X/245aa0cdc6847cf59647e1c7102e253e99d40b69.txt')
and prepend '/var/www/discourse/public' to it to get the file's path inside the container.
INSTRUCTIONS:
**run this script from INSIDE your Discourse Docker container**
$ ssh <your-discourse-server>
$ cd /var/discourse
$ ./launcher enter app
# apt install sqlite3 libsqlite3-dev git
# gem install sqlite3
# cd /var/www/discourse/script/import_scripts
# open mbox.rb, and comment out the very last line that calls "perform"
# download this script from the internet and save as "googlegroups.rb"
# su discourse
# ruby googlegroups.rb <name-of-your-google-group-goes-here>
=end
class ImportScripts::GoogleGroups < ImportScripts::Mbox
CATEGORY_MAPPINGS = {
"default" => "googlegroup"
}
# Configure the importer for one Google Group and run one-time setup.
# google_group_name: the group's short name as it appears in its URL;
# raises when it is missing, since every path and ENV value derives from it.
def initialize(google_group_name)
  if google_group_name.blank?
    raise "No google group name specified!"
  end
  @google_group_name = google_group_name # your google group name
  @first_time = true # true: scrape everything; false: scrape only recent items via the crawler's -rss option
  @use_cookies = false # true: use a Manager account's cookies.txt to recover members' email addresses
  @load_users = true # true: use an exported member-list CSV to recover email addresses and nicknames
  # @copy_from = "/shared/google-group-crawler" # optional: reuse a previously-run crawler from a mounted folder
  @users = [] # members loaded from the csv file by load_users
  setup_google_group
  load_users if @load_users
  super()
end
# Run the import: populate the mbox files (full scrape on the first run,
# incremental RSS update afterwards), then hand off to the Mbox importer.
def execute
  if @first_time
    scrape_google_group_to_mbox
  else
    update_google_group_to_mbox
  end
  super
end
# a valid csv file called #{@google_group_name}.csv from google groups
# is expected in the /tmp folder if @load_users = true
# Load the member list exported from Google Groups.
# A valid csv file called #{@google_group_name}.csv is expected in /tmp
# when @load_users = true. Columns:
#   0 Email address, 1 Nickname, 2 Group status, 3 Email status,
#   4 Email preference, 5 Posting permissions, 6 Join year, 7 Join month,
#   8 Join day, 9 Join hour, 10 Join minute, 11 Join second, 12 Time zone
# Only the address and nickname are kept, as {"email" => ..., "nickname" => ...}.
def load_users
  puts "Loading users from /tmp/#{@google_group_name}.csv"
  @users = CSV.foreach(File.path("/tmp/#{@google_group_name}.csv")).map do |record|
    { "email" => record[0], "nickname" => record[1] }
  end
  puts
end
# cross match users to recover email
# Cross-match a scraped sender address against the CSV member list.
# Google partially redacts addresses in scraped messages (e.g. "bo...@example.com"),
# so the run of dots before the "@" is treated as a wildcard when matching
# against the full addresses from the member list.
#
# Returns the matching user Hash, or +email+ unchanged when no member matches
# (including when +email+ is nil).
def match_user(users, email)
  return email if email.nil?
  # Escape regex metacharacters first (addresses may contain "+", "." etc.),
  # then turn the escaped elided-dot run into a ".*" wildcard.
  pattern = Regexp.escape(email).gsub(/(\\\.)+@/, ".*@")
  i = users.index { |user| user["email"] =~ /#{pattern}/ }
  i ? users[i] : email
end
# a valid cookie file called cookies.txt from google groups
# is expected in the /tmp folder if @use_cookies = true
# Export the configuration consumed by the google-group-crawler shell scripts,
# print a summary of what will be used, and optionally seed /tmp from a
# previously-downloaded crawler directory.
# A valid cookie file called cookies.txt (exported from a logged-in Manager
# browser session) is expected in /tmp when @use_cookies = true.
def setup_google_group
  ENV['_GROUP'] = @google_group_name # group name read by crawler.sh
  # pass the Manager session cookies to wget so member addresses are visible
  ENV['_WGET_OPTIONS'] = "--load-cookies /tmp/cookies.txt --keep-session-cookies" if @use_cookies
  puts ""
  puts "Your Google Group name is #{@google_group_name}"
  puts "So I'm expecting the Google Group URL to be https://groups.google.com/forum/#!forum/#{@google_group_name}"
  puts "First time importing? #{@first_time.to_s}"
  puts "Use /tmp/cookies.txt? #{@use_cookies.to_s}"
  puts "Use /tmp/#{@google_group_name}.csv? #{@load_users.to_s}"
  if @copy_from.present?
    puts "Copying existing scrapper from #{@copy_from} to /tmp"
    system "rm -rf /tmp/google-group-crawler" # discard any stale copy first
    system "cp -r #{@copy_from} /tmp"
    system "cp #{@copy_from}/cookies.txt /tmp" if @use_cookies
    system "cp #{@copy_from}/#{@google_group_name}.csv /tmp" if @load_users
  end
  # ensure the unprivileged discourse user can read/write the crawler output
  system "chmod -R 777 /tmp/google-group-crawler"
end
# scrape content of the Google Group using https://github.com/icy/google-group-crawler
# do everything in /tmp/
# First-pass scrape of the Google Group using
# https://github.com/icy/google-group-crawler — everything happens in /tmp/.
# Produces one raw message file per post under
# /tmp/google-group-crawler/#{@google_group_name}/mbox/.
def scrape_google_group_to_mbox
  FileUtils.rm_rf("/tmp/google-group-crawler") # idempotent
  puts "Clone the Google Group Crawler from icy ..."
  system 'git clone https://github.com/icy/google-group-crawler /tmp/google-group-crawler'
  # perform the scrape
  Dir.chdir '/tmp/google-group-crawler/' do
    system 'chmod +x ./crawler.sh'
    puts "Start the first pass collection of topics"
    system './crawler.sh -sh > wget.sh' # generates one wget command per topic
    system 'chmod +x ./wget.sh'
    puts "Iterate through topics to get messages"
    system './wget.sh' # downloads every raw message
    system "chmod -R 777 #{@google_group_name}" # make output readable by the discourse user
  end
end
# Incremental scrape for subsequent runs: uses the crawler's -rss mode to
# fetch only recent topics instead of re-crawling the whole group.
def update_google_group_to_mbox
  # perform the scrape
  Dir.chdir '/tmp/google-group-crawler/' do
    system 'chmod +x ./crawler.sh'
    puts "Update topics"
    system './crawler.sh -rss > update.sh' # generates wget commands for recent topics only
    system 'chmod +x ./update.sh'
    puts "Iterate through topics to get messages"
    system './update.sh'
    system "chmod -R 777 #{@google_group_name}" # make output readable by the discourse user
  end
end
# override
# override: keep the index database next to the crawler output in /tmp.
def open_db
  db_path = "/tmp/google-group-crawler/index.db"
  SQLite3::Database.new(db_path)
end
# override
# override: yield every scraped message as a parsed Mail object together with
# its source file path, reporting progress as it goes.
def all_messages
  puts "Loading all messages"
  mbox_files = Dir.glob("/tmp/google-group-crawler/#{@google_group_name}/mbox/*")
  total = mbox_files.size
  mbox_files.each_with_index do |path, i|
    parsed = Mail.read_from_string(File.read(path))
    yield parsed, path
    print_status(i, total)
  end
end
# override
# override: create every mapped category, skipping the built-in "uncategorized".
def import_categories
  names = CATEGORY_MAPPINGS.values.reject { |name| name == 'uncategorized' }
  create_categories(names) do |name|
    { id: name, name: name }
  end
end
# override
# override: derive [email, display name] for the sender of +mail+.
# Scraped senders are often obfuscated ("bob at example.com",
# "bo...@example.com (Bob)"), so the address is de-obfuscated and then
# cross-matched against the member list loaded from the CSV to recover the
# real address and nickname.
def extract_name(mail)
  from_name = nil
  from = mail[:from]
  from_email = nil
  if mail.from.present?
    from_email = mail.from.dup
    from_email = from_email.first.dup if from_email.kind_of?(Array)
    from_email.gsub!(/ at /, '@')   # "bob at example.com" -> "bob@example.com"
    from_email.gsub!(/ \(.*$/, '')  # strip a trailing "(Display Name)"
    search = match_user(@users, from_email)
    # match_user returns a user Hash on success but the email String on a
    # miss; only a Hash carries recovered member data. (Indexing a String
    # with "email" would return nil and silently wipe the address.)
    if search.is_a?(Hash)
      from_email = search["email"]
      from_name = search["nickname"] unless search["nickname"].blank?
    end
  end
  # prefer a proper display name from the parsed From header when available
  display_names = from.try(:display_names)
  if display_names.present?
    from_name = display_names.first
  end
  # fall back to a parenthesised "(Name)" embedded in the raw header
  if from_name.blank? && from.to_s =~ /\(([^\)]+)\)/
    from_name = Regexp.last_match[1]
  end
  from_name = from.to_s if from_name.blank?
  [from_email, from_name]
end
# Map each topic title to the msg_id of its chronologically first email, so
# that replies can later be wired to the root message of their thread.
# Returns a Hash of title => msg_id.
def find_all_topics
  db = open_db
  title2id = {}
  # for every title, pick the email with the earliest email_date
  rows = db.execute "
    SELECT msg_id, title FROM emails AS f
    WHERE datetime(f.email_date) = (
      SELECT min(datetime(e.email_date)) FROM emails AS e
      WHERE e.title = f.title
    );
  "
  rows.each do |row|
    title2id[row[1]] = row[0]
  end
  title2id
ensure
  db.close if db # guard: don't mask an open_db failure with nil.close
end
# override
# override: rebuild reply threading in the index database. The first email of
# every topic becomes the thread root (reply_to = NULL) and every other email
# sharing its title is pointed at that root.
def massage_indices
  db = open_db
  mappings = find_all_topics
  puts "#{mappings.size} topics in total"
  str_ids = (mappings.values.map { |id| "'#{id}'" }).join(',')
  # clear reply_to on the thread roots (ids come from our own db, not user input)
  db.execute "UPDATE emails SET reply_to = null WHERE msg_id in (#{str_ids})"
  puts "wiring up replies for these topics"
  mappings.each_with_index do |(title, root_id), idx|
    db.execute "UPDATE emails SET reply_to = ? WHERE title = ? AND msg_id <> ?", [root_id, title, root_id]
    print_status(idx, mappings.size)
  end
ensure
  db.close if db # guard: don't mask an open_db failure with nil.close
end
# override
# override: build the sqlite email index from the scraped mbox files.
# Drops and recreates the emails table, then inserts one row per message with
# sender, title, reply threading, date and the raw message text.
def create_email_indices
  db = open_db
  db.execute "DROP TABLE IF EXISTS emails"
  db.execute <<-SQL
  CREATE TABLE emails (
    msg_id VARCHAR(995) PRIMARY KEY,
    from_email VARCHAR(255) NOT NULL,
    from_name VARCHAR(255) NOT NULL,
    title VARCHAR(255) NOT NULL,
    reply_to VARCHAR(955) NULL,
    email_date DATETIME NOT NULL,
    message TEXT NOT NULL,
    category VARCHAR(255) NOT NULL
  );
  SQL
  db.execute "CREATE INDEX by_title ON emails (title)"
  db.execute "CREATE INDEX by_email ON emails (from_email)"
  puts "", "creating indices"
  all_messages do |mail, filename|
    category = CATEGORY_MAPPINGS['default'] || 'uncategorized'
    msg_id = mail['Message-ID'].to_s
    from_email, from_name = extract_name(mail)
    title = clean_title(mail['Subject'].to_s)
    reply_to = mail['In-Reply-To'].to_s
    date = mail['date'].to_s
    # normalize to ISO-8601 when a Date header exists; keep "" otherwise
    email_date = date.blank? ? "" : DateTime.parse(date).to_s
    # INSERT OR IGNORE: the same message can show up in more than one mbox file
    db.execute "INSERT OR IGNORE INTO emails
                (msg_id,
                from_email,
                from_name,
                title,
                reply_to,
                email_date,
                message,
                category)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
               [msg_id, from_email, from_name, title, reply_to, email_date, mail.to_s, category]
  end
ensure
  db.close if db # guard: don't mask an open_db failure with nil.close
end
end
ImportScripts::GoogleGroups.new(ARGV[0]).perform