From de2e522e1a97f48475a3250826098db512544ca9 Mon Sep 17 00:00:00 2001 From: Knut Ahlers Date: Sat, 20 Jun 2015 23:12:29 +0200 Subject: [PATCH] Initial version --- .gitignore | 1 + LICENSE | 13 ++++++ README.md | 36 ++++++++++++++++ main.go | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 168 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 main.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5bf74d4 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +clean_couch diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4fde5d2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,13 @@ +Copyright 2015 Knut Ahlers + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..122f201 --- /dev/null +++ b/README.md @@ -0,0 +1,36 @@ +# Luzifer / clean_couch + +[![License: Apache 2.0](http://badge.luzifer.io/v1/badge?color=5d79b5&title=license&text=Apache%202.0)](http://www.apache.org/licenses/LICENSE-2.0) + +This utility emerged from the need to delete about 20k documents from a CouchDB database with more than 600k documents. I did not want to delete every document by hand and had no other way to delete documents matching a specific filter. + +## Usage + +1. 
Create a view which filters the documents in your database with exactly this emit line you can see in this example + +```javascript +function(doc) { + if (doc.user == "usertodelete") { + emit(doc._rev, null); + } +} +``` + +2. Execute with parameters + +```bash +# ./clean_couch +Usage of ./clean_couch: + --baseurl="http://localhost:5984": BaseURL of your CouchDB instance + --concurrency=50: How many delete requests should get processed concurrently? + --database="": The database containing your view and the data to delete + --view="": The view selecting the data to delete + +# ./clean_couch --database=userdata --view=_design/del/_view/usertodelete +``` + +## Warnings + +- If you set the concurrency above 1024 either `clean_couch` or even the CouchDB server might break because of a limit in open file descriptors +- If the database has many views you could overload your server because views need to get recalculated +(My CouchDB server survived a concurrency of 100 with minimal load) diff --git a/main.go b/main.go new file mode 100644 index 0000000..f408d7b --- /dev/null +++ b/main.go @@ -0,0 +1,118 @@ +package main + +import ( + "encoding/json" + "fmt" + "net/http" + "time" + + "github.com/cenkalti/backoff" + "github.com/spf13/pflag" +) + +var config = struct { + CouchBaseURL string + Database string + View string + Routines int + + // Private storage + totalNumberOfDocuments int + processedDocuments int + processChannel chan bool + concurrencyChannel chan bool +}{ + processChannel: make(chan bool, 10), +} + +func main() { + pflag.StringVar(&config.CouchBaseURL, "baseurl", "http://localhost:5984", "BaseURL of your CouchDB instance") + pflag.StringVar(&config.Database, "database", "", "The database containing your view and the data to delete") + pflag.StringVar(&config.View, "view", "", "The view selecting the data to delete") + pflag.IntVar(&config.Routines, "concurrency", 20, "How many delete requests should get processed concurrently?") + pflag.Parse() + + if 
config.Database == "" || config.View == "" { + pflag.Usage() + return + } + + delData := struct { + Rows []struct { + ID string `json:"id"` + Rev string `json:"key"` + } `json:"rows"` + }{} + + err := backoff.Retry(func() error { + req, _ := http.NewRequest("GET", fmt.Sprintf("%s/%s/%s", config.CouchBaseURL, config.Database, config.View), nil) + res, err := http.DefaultClient.Do(req) + if err != nil { + return err + } + defer res.Body.Close() + + if err := json.NewDecoder(res.Body).Decode(&delData); err != nil { + return err + } + + return nil + }, backoff.NewExponentialBackOff()) + if err != nil { + fmt.Printf("Tried to get the view but did not succeed: %s", err) + return + } + + config.totalNumberOfDocuments = len(delData.Rows) + config.processedDocuments = 0 + + config.concurrencyChannel = make(chan bool, config.Routines) + + go func() { + for _, row := range delData.Rows { + // Blocks when concurrency channel is full + config.concurrencyChannel <- true + + go func(finChan chan bool, conChan chan bool, id, rev string) { + // Retry deletes + bo := backoff.NewExponentialBackOff() + bo.InitialInterval = 5 * time.Second + + err := backoff.Retry(func() error { + url := fmt.Sprintf("%s/%s/%s?rev=%s", config.CouchBaseURL, config.Database, id, rev) + req, _ := http.NewRequest("DELETE", url, nil) + res, err := http.DefaultClient.Do(req) + if err != nil { + return err + } + res.Body.Close() + return nil + }, bo) + if err != nil { + fmt.Printf("Unable to delete document with ID %s", id) + } + // Increase finished counter + finChan <- true + + // Remove self from concurrency limit + <-conChan + }(config.processChannel, config.concurrencyChannel, row.ID, row.Rev) + } + }() + + ticker := time.NewTicker(time.Second) + + for { + select { + case <-config.processChannel: + config.processedDocuments++ + if config.processedDocuments == config.totalNumberOfDocuments { + fmt.Print("\n\n") + return + } + case <-ticker.C: + fmt.Printf("Processed %d of %d documents.\r", 
config.processedDocuments, config.totalNumberOfDocuments) + } + } + +}