diff --git a/.gitignore b/.gitignore
index a9de979..a58ccd1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,6 @@
 
 # work
 /.work/
+
+/documents.zip
+/documents
diff --git a/README.md b/README.md
index acaeb1c..39d438c 100644
--- a/README.md
+++ b/README.md
@@ -4,8 +4,9 @@ CLI tool to interact with paperless-ngx remote API
 
 ## Subcommands
 
-- `upload`: Uploads local document(s) to Paperless instance
+- `upload`: Uploads local document(s) to Paperless instance.
 - `consume`: Consumes a local directory and uploads each file to Paperless instance. The files will be deleted once uploaded.
+- `bulk-download`: Downloads all documents from the Paperless instance at once.
 
 ## Installation
 
diff --git a/bulk_download_command.go b/bulk_download_command.go
new file mode 100644
index 0000000..b3f174d
--- /dev/null
+++ b/bulk_download_command.go
@@ -0,0 +1,126 @@
+package main
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/ccremer/paperless-cli/pkg/archive"
+	"github.com/ccremer/paperless-cli/pkg/paperless"
+	"github.com/go-logr/logr"
+	"github.com/urfave/cli/v2"
+)
+
+type BulkDownloadCommand struct {
+	cli.Command
+
+	PaperlessURL   string
+	PaperlessToken string
+	PaperlessUser  string
+
+	TargetPath              string
+	Content                 string
+	UnzipEnabled            bool
+	OverwriteExistingTarget bool
+}
+
+func newBulkDownloadCommand() *BulkDownloadCommand {
+	c := &BulkDownloadCommand{}
+	c.Command = cli.Command{
+		Name:   "bulk-download",
+		Usage:  "Downloads all documents at once",
+		Action: actions(LogMetadata, c.Action),
+		Flags: []cli.Flag{
+			newURLFlag(&c.PaperlessURL),
+			newUsernameFlag(&c.PaperlessUser),
+			newTokenFlag(&c.PaperlessToken),
+			newTargetPathFlag(&c.TargetPath),
+			newDownloadContentFlag(&c.Content),
+			newUnzipFlag(&c.UnzipEnabled),
+			newOverwriteFlag(&c.OverwriteExistingTarget),
+		},
+	}
+	return c
+}
+
+func (c *BulkDownloadCommand) Action(ctx *cli.Context) error {
+	log := logr.FromContextOrDiscard(ctx.Context)
+
+	if prepareErr := c.prepareTarget(); prepareErr != nil {
+		return prepareErr
+	}
+	clt := paperless.NewClient(c.PaperlessURL, c.PaperlessUser, c.PaperlessToken)
+
+	log.Info("Getting list of documents")
+	documents, queryErr := clt.QueryDocuments(ctx.Context, paperless.QueryParams{
+		TruncateContent: true,
+		Ordering:        "id",
+		PageSize:        100,
+	})
+	if queryErr != nil {
+		return queryErr
+	}
+	documentIDs := paperless.MapToDocumentIDs(documents)
+
+	tmpFile, createTempErr := os.CreateTemp(os.TempDir(), "paperless-bulk-download-")
+	if createTempErr != nil {
+		return fmt.Errorf("cannot open temporary file: %w", createTempErr)
+	}
+	defer os.Remove(tmpFile.Name()) // cleanup if not renamed
+
+	log.Info("Downloading documents")
+	downloadErr := clt.BulkDownload(ctx.Context, tmpFile, paperless.BulkDownloadParams{
+		FollowFormatting: true,
+		Content:          paperless.BulkDownloadContent(c.Content),
+		DocumentIDs:      documentIDs,
+	})
+	if downloadErr != nil {
+		return downloadErr
+	}
+
+	if c.UnzipEnabled {
+		return c.unzip(ctx, tmpFile)
+	}
+	return c.move(ctx, tmpFile)
+}
+
+func (c *BulkDownloadCommand) unzip(ctx *cli.Context, tmpFile *os.File) error {
+	log := logr.FromContextOrDiscard(ctx.Context)
+	downloadFilePath := c.getTargetPath()
+	if unzipErr := archive.Unzip(ctx.Context, tmpFile.Name(), downloadFilePath); unzipErr != nil {
+		return fmt.Errorf("cannot unzip file %q to %q: %w", tmpFile.Name(), downloadFilePath, unzipErr)
+	}
+	log.Info("Unzipped archive to dir", "dir", downloadFilePath)
+	return nil
+}
+
+func (c *BulkDownloadCommand) move(ctx *cli.Context, tmpFile *os.File) error {
+	log := logr.FromContextOrDiscard(ctx.Context)
+	downloadFilePath := c.getTargetPath()
+	if renameErr := os.Rename(tmpFile.Name(), downloadFilePath); renameErr != nil {
+		return fmt.Errorf("cannot move temp file: %w", renameErr)
+	}
+	log.Info("Downloaded zip archive", "file", downloadFilePath)
+	return nil
+}
+
+func (c *BulkDownloadCommand) getTargetPath() string {
+	if c.TargetPath != "" {
+		return c.TargetPath
+	}
+	if c.UnzipEnabled {
+		return "documents"
+	}
+	return "documents.zip"
+}
+
+func (c *BulkDownloadCommand) prepareTarget() error {
+	target := c.getTargetPath()
+	if c.OverwriteExistingTarget {
+		return os.RemoveAll(target)
+	}
+	_, err := os.Stat(target)
+	if err != nil && os.IsNotExist(err) {
+		return nil
+	}
+	return fmt.Errorf("target %q exists, abort", target)
+}
diff --git a/flags.go b/flags.go
index 344b485..9c5218f 100644
--- a/flags.go
+++ b/flags.go
@@ -2,8 +2,10 @@ package main
 
 import (
 	"fmt"
+	"strings"
 	"time"
 
+	"github.com/ccremer/paperless-cli/pkg/paperless"
 	"github.com/urfave/cli/v2"
 )
 
@@ -114,6 +116,52 @@ func newConsumeDelayFlag(dest *time.Duration) *cli.DurationFlag {
 	}
 }
 
+func newTargetPathFlag(dest *string) *cli.StringFlag {
+	return &cli.StringFlag{
+		Name: "target-path", EnvVars: []string{"DOWNLOAD_TARGET_PATH"},
+		Usage:       "target file path where documents are downloaded.",
+		DefaultText: "documents.zip",
+		Destination: dest,
+	}
+}
+
+func newDownloadContentFlag(dest *string) *cli.StringFlag {
+	return &cli.StringFlag{
+		Name: "content", EnvVars: []string{"DOWNLOAD_CONTENT"},
+		Usage:       "selection of document variant.",
+		Value:       paperless.BulkDownloadArchives.String(),
+		Destination: dest,
+		Action: func(ctx *cli.Context, s string) error {
+			enum := []string{
+				paperless.BulkDownloadArchives.String(),
+				paperless.BulkDownloadOriginal.String(),
+				paperless.BulkDownloadBoth.String()}
+			for _, key := range enum {
+				if s == key {
+					return nil
+				}
+			}
+			return fmt.Errorf("parameter %q must be one of [%s]", "content", strings.Join(enum, ", "))
+		},
+	}
+}
+
+func newUnzipFlag(dest *bool) *cli.BoolFlag {
+	return &cli.BoolFlag{
+		Name: "unzip", EnvVars: []string{"DOWNLOAD_UNZIP"},
+		Usage:       "unzip the downloaded file.",
+		Destination: dest,
+	}
+}
+
+func newOverwriteFlag(dest *bool) *cli.BoolFlag {
+	return &cli.BoolFlag{
+		Name: "overwrite", EnvVars: []string{"DOWNLOAD_OVERWRITE"},
+		Usage:       "deletes existing file(s) before downloading.",
+		Destination: dest,
+	}
+}
+
 func checkEmptyString(flagName string) func(*cli.Context, string) error {
 	return func(ctx *cli.Context, s string) error {
 		if s == "" {
diff --git a/go.mod b/go.mod
index 4e2b2fc..6c6d3e5 100644
--- a/go.mod
+++ b/go.mod
@@ -7,6 +7,7 @@ require (
 	github.com/fsnotify/fsnotify v1.6.0
 	github.com/go-logr/logr v1.2.3
 	github.com/pterm/pterm v0.12.51
+	github.com/stretchr/testify v1.8.1
 	github.com/urfave/cli/v2 v2.23.7
 )
 
 require (
 	atomicgo.dev/keyboard v0.2.8 // indirect
 	github.com/containerd/console v1.0.3 // indirect
 	github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
+	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/gookit/color v1.5.2 // indirect
 	github.com/lithammer/fuzzysearch v1.1.5 // indirect
 	github.com/mattn/go-runewidth v0.0.14 // indirect
+	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/rivo/uniseg v0.2.0 // indirect
 	github.com/russross/blackfriday/v2 v2.1.0 // indirect
 	github.com/xo/terminfo v0.0.0-20210125001918-ca9a967f8778 // indirect
@@ -25,4 +28,5 @@
 	golang.org/x/sys v0.0.0-20220908164124-27713097b956 // indirect
 	golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect
 	golang.org/x/text v0.4.0 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
diff --git a/go.sum b/go.sum
index c904c01..08d2fb6 100644
--- a/go.sum
+++ b/go.sum
@@ -38,8 +38,10 @@ github.com/klauspost/cpuid/v2 v2.0.12/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuOb
 github.com/klauspost/cpuid/v2 v2.1.0/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY=
 github.com/klauspost/cpuid/v2 v2.2.0 h1:4ZexSFt8agMNzNisrsilL6RClWDC5YJnLHNIfTy4iuc=
 github.com/klauspost/cpuid/v2 v2.2.0/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY=
+github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
 github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/lithammer/fuzzysearch v1.1.5 h1:Ag7aKU08wp0R9QCfF4GoGST9HbmAIeLP7xwMrOBEp1c=
 github.com/lithammer/fuzzysearch v1.1.5/go.mod h1:1R1LRNk7yKid1BaQkmuLQaHruxcC4HmAH30Dh61Ih1Q=
@@ -116,6 +118,7 @@ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn
 golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
 gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
diff --git a/main.go b/main.go
index 9b2bdc7..772aa6e 100644
--- a/main.go
+++ b/main.go
@@ -43,6 +43,7 @@ func NewApp() *cli.App {
 		},
 		Commands: []*cli.Command{
 			&newUploadCommand().Command,
+			&newBulkDownloadCommand().Command,
 			&newConsumeCommand().Command,
 		},
 	}
diff --git a/pkg/archive/testdata/unzip.zip b/pkg/archive/testdata/unzip.zip
new file mode 100644
index 0000000..c3556ce
Binary files /dev/null and b/pkg/archive/testdata/unzip.zip differ
diff --git a/pkg/archive/unzip.go b/pkg/archive/unzip.go
new file mode 100644
index 0000000..760da5c
--- /dev/null
+++ b/pkg/archive/unzip.go
@@ -0,0 +1,70 @@
+package archive
+
+import (
+	"archive/zip"
+	"context"
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/go-logr/logr"
+)
+
+// Unzip reads and copies every file in the archive to the destination dir.
+func Unzip(ctx context.Context, source, dest string) error {
+	log := logr.FromContextOrDiscard(ctx)
+	log.V(1).Info("Unzipping file", "source", source, "dest", dest)
+	archive, openErr := zip.OpenReader(source)
+	if openErr != nil {
+		return fmt.Errorf("cannot open source file: %w", openErr)
+	}
+	defer archive.Close()
+
+	for _, f := range archive.File {
+		destFilePath := filepath.Join(dest, f.Name)
+
+		if !strings.HasPrefix(destFilePath, filepath.Clean(dest)+string(os.PathSeparator)) {
+			return fmt.Errorf("invalid file path: %s", destFilePath)
+		}
+		if f.FileInfo().IsDir() {
+			log.V(2).Info("Creating directory", "dir", f.FileInfo().Name())
+			if mkdirErr := os.MkdirAll(destFilePath, os.ModePerm); mkdirErr != nil {
+				return fmt.Errorf("cannot create directory: %w", mkdirErr)
+			}
+			continue
+		}
+		log.V(2).Info("Extracting file", "source", f.Name, "dest", destFilePath)
+
+		err := unzipFile(f, destFilePath)
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func unzipFile(f *zip.File, destFilePath string) error {
+	// ensure directory exists where file should be written.
+	if mkdirErr := os.MkdirAll(filepath.Dir(destFilePath), os.ModePerm); mkdirErr != nil {
+		return fmt.Errorf("cannot create directory: %w", mkdirErr)
+	}
+
+	dstFile, dstFileErr := os.OpenFile(destFilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode())
+	if dstFileErr != nil {
+		return fmt.Errorf("cannot open destination file: %w", dstFileErr)
+	}
+	defer dstFile.Close()
+
+	fileInArchive, srcFileErr := f.Open()
+	if srcFileErr != nil {
+		return fmt.Errorf("cannot open source file: %w", srcFileErr)
+	}
+	defer fileInArchive.Close()
+
+	if _, copyErr := io.Copy(dstFile, fileInArchive); copyErr != nil {
+		return fmt.Errorf("cannot copy %q to %q: %w", f.Name, dstFile.Name(), copyErr)
+	}
+	return nil
+}
diff --git a/pkg/archive/unzip_test.go b/pkg/archive/unzip_test.go
new file mode 100644
index 0000000..22f32aa
--- /dev/null
+++ b/pkg/archive/unzip_test.go
@@ -0,0 +1,28 @@
+package archive
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestUnzip(t *testing.T) {
+	testFilePath := "testdata/unzip.zip"
+	testDir := "testdata/run"
+
+	// cleanup previous test files in case of failure
+	require.NoError(t, os.RemoveAll(testDir))
+
+	err := Unzip(context.TODO(), testFilePath, testDir)
+	assert.NoError(t, err, "unzip failed with error")
+
+	assert.FileExists(t, filepath.Join(testDir, "toplevel.file"))
+	assert.FileExists(t, filepath.Join(testDir, "Dir In Archive", "Sub Dir.file"))
+
+	// cleanup
+	require.NoError(t, os.RemoveAll(testDir))
+}
diff --git a/pkg/paperless/document.go b/pkg/paperless/document.go
new file mode 100644
index 0000000..9a2075d
--- /dev/null
+++ b/pkg/paperless/document.go
@@ -0,0 +1,14 @@
+package paperless
+
+type Document struct {
+	// ID of the document, read-only.
+	ID int `json:"id"`
+}
+
+func MapToDocumentIDs(docs []Document) []int {
+	ids := make([]int, len(docs))
+	for i := 0; i < len(docs); i++ {
+		ids[i] = docs[i].ID
+	}
+	return ids
+}
diff --git a/pkg/paperless/download.go b/pkg/paperless/download.go
new file mode 100644
index 0000000..e986951
--- /dev/null
+++ b/pkg/paperless/download.go
@@ -0,0 +1,86 @@
+package paperless
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+
+	"github.com/go-logr/logr"
+)
+
+type BulkDownloadContent string
+
+type BulkDownloadParams struct {
+	DocumentIDs      []int
+	FollowFormatting bool
+	Content          BulkDownloadContent
+}
+
+const (
+	BulkDownloadBoth     BulkDownloadContent = "both"
+	BulkDownloadArchives BulkDownloadContent = "archive"
+	BulkDownloadOriginal BulkDownloadContent = "originals"
+)
+
+// String implements fmt.Stringer.
+func (c BulkDownloadContent) String() string {
+	return string(c)
+}
+
+// BulkDownload downloads the documents identified by BulkDownloadParams.DocumentIDs as a single zip archive
+// and writes the response body to the given targetFile.
+func (clt *Client) BulkDownload(ctx context.Context, targetFile *os.File, params BulkDownloadParams) error {
+	req, err := clt.makeBulkDownloadRequest(ctx, params)
+	if err != nil {
+		return err
+	}
+
+	log := logr.FromContextOrDiscard(ctx)
+	log.V(1).Info("Awaiting response")
+	resp, err := clt.HttpClient.Do(req)
+	if err != nil {
+		return fmt.Errorf("request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		b, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("request failed: %s: %s", resp.Status, string(b))
+	}
+
+	log.V(1).Info("Writing download content to file", "file", targetFile.Name())
+	_, err = io.Copy(targetFile, resp.Body)
+	if err != nil {
+		return fmt.Errorf("cannot read response body: %w", err)
+	}
+	return nil
+}
+
+func (clt *Client) makeBulkDownloadRequest(ctx context.Context, params BulkDownloadParams) (*http.Request, error) {
+	log := logr.FromContextOrDiscard(ctx)
+
+	js := map[string]any{
+		"content":           params.Content,
+		"follow_formatting": params.FollowFormatting,
+		"documents":         params.DocumentIDs,
+	}
+	marshal, err := json.Marshal(js)
+	if err != nil {
+		return nil, fmt.Errorf("cannot serialize to JSON: %w", err)
+	}
+	body := bytes.NewReader(marshal)
+
+	path := clt.URL + "/api/documents/bulk_download/"
+	log.V(1).Info("Preparing request", "path", path, "document_ids", params.DocumentIDs)
+	req, err := http.NewRequestWithContext(ctx, "POST", path, body)
+	if err != nil {
+		return nil, fmt.Errorf("cannot prepare request: %w", err)
+	}
+	clt.setAuth(req)
+	req.Header.Set("Content-Type", "application/json")
+	return req, nil
+}
diff --git a/pkg/paperless/query.go b/pkg/paperless/query.go
new file mode 100644
index 0000000..94b6174
--- /dev/null
+++ b/pkg/paperless/query.go
@@ -0,0 +1,130 @@
+package paperless
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"reflect"
+	"strconv"
+
+	"github.com/go-logr/logr"
+)
+
+type QueryParams struct {
+	TruncateContent bool   `param:"truncate_content"`
+	Ordering        string `param:"ordering"`
+	PageSize        int64  `param:"page_size"`
+	page            int64  `param:"page"`
+}
+
+type QueryResult struct {
+	Results []Document `json:"results,omitempty"`
+	Next    string     `json:"next,omitempty"`
+}
+
+// NextPage returns the next page number for pagination.
+// It returns 1 if QueryResult.Next is empty (first page), or 0 if there's an error parsing QueryResult.Next.
+func (r QueryResult) NextPage() int64 {
+	if r.Next == "" {
+		return 1 // first page
+	}
+	values, err := url.ParseQuery(r.Next)
+	if err != nil {
+		return 0
+	}
+	raw := values.Get("page")
+	page, err := strconv.ParseInt(raw, 10, 64)
+	if err != nil {
+		return 0
+	}
+	return page
+}
+
+func (clt *Client) QueryDocuments(ctx context.Context, params QueryParams) ([]Document, error) {
+	documents := make([]Document, 0)
+	params.page = 1
+	for i := int64(0); i < params.page; i++ {
+		result, err := clt.queryDocumentsInPage(ctx, params)
+		if err != nil {
+			return nil, err
+		}
+		params.page = result.NextPage()
+		documents = append(documents, result.Results...)
+	}
+	return documents, nil
+}
+
+func (clt *Client) makeQueryRequest(ctx context.Context, params QueryParams) (*http.Request, error) {
+	log := logr.FromContextOrDiscard(ctx)
+
+	values := paramsToValues(params)
+
+	path := clt.URL + "/api/documents/?" + values.Encode()
+	log.V(1).Info("Preparing request", "path", path)
+	req, err := http.NewRequestWithContext(ctx, "GET", path, nil)
+	if err != nil {
+		return nil, fmt.Errorf("cannot prepare request: %w", err)
+	}
+	clt.setAuth(req)
+	req.Header.Set("Content-Type", "application/json")
+	return req, nil
+}
+
+func (clt *Client) queryDocumentsInPage(ctx context.Context, params QueryParams) (*QueryResult, error) {
+	req, err := clt.makeQueryRequest(ctx, params)
+	if err != nil {
+		return nil, err
+	}
+
+	log := logr.FromContextOrDiscard(ctx)
+	log.V(1).Info("Awaiting response")
+	resp, err := clt.HttpClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	b, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("cannot read body: %w", err)
+	}
+	log.V(2).Info("Read response", "body", string(b))
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("request failed: %s: %s", resp.Status, string(b))
+	}
+
+	result := QueryResult{}
+	parseErr := json.Unmarshal(b, &result)
+	if parseErr != nil {
+		return nil, fmt.Errorf("cannot parse JSON: %w", parseErr)
+	}
+	log.V(1).Info("Parsed response", "result", result)
+	return &result, nil
+}
+
+func paramsToValues(params QueryParams) url.Values {
+	values := url.Values{}
+	typ := reflect.TypeOf(params)
+	value := reflect.ValueOf(params)
+	for i := 0; i < typ.NumField(); i++ {
+		structField := typ.Field(i)
+		tag := structField.Tag.Get("param")
+		field := value.Field(i)
+		paramValue := ""
+		switch field.Kind() {
+		case reflect.Bool:
+			paramValue = strconv.FormatBool(field.Bool())
+		case reflect.String:
+			paramValue = field.String()
+		case reflect.Int64:
+			paramValue = strconv.FormatInt(field.Int(), 10)
+		default:
+			panic(fmt.Errorf("not implemented type: %s", field.Kind()))
+		}
+		values.Set(tag, paramValue)
+	}
+	return values
+}
diff --git a/test/docker-compose.yml b/test/docker-compose.yml
index 60219fd..c9b1224 100644
--- a/test/docker-compose.yml
+++ b/test/docker-compose.yml
@@ -42,6 +42,7 @@ services:
       PAPERLESS_REDIS: redis://broker:6379
       PAPERLESS_ADMIN_USER: admin
       PAPERLESS_ADMIN_PASSWORD: admin
+      PAPERLESS_FILENAME_FORMAT: "{created_year}/{correspondent}/{title}"
 
 volumes:
   redisdata:
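Aside, not part of the patch above: a minimal example test sketch illustrating how the `param` struct tags of `QueryParams` are encoded into URL query parameters by `paramsToValues`. The test file name `query_test.go` is hypothetical; it would need to live in package `paperless` (same package, so the unexported `page` field is reachable) and uses the testify dependency added in go.mod.

```go
package paperless

import (
	"testing"

	"github.com/stretchr/testify/assert"
)

// TestParamsToValues is a hypothetical example test (not part of the patch).
// It shows that each field tagged with `param` ends up as a query parameter,
// and that url.Values.Encode sorts the parameters alphabetically by key.
func TestParamsToValues(t *testing.T) {
	params := QueryParams{
		TruncateContent: true,
		Ordering:        "id",
		PageSize:        100,
		page:            2,
	}

	values := paramsToValues(params)

	assert.Equal(t, "ordering=id&page=2&page_size=100&truncate_content=true", values.Encode())
}
```

This is the same encoding that makeQueryRequest appends to `/api/documents/?` when QueryDocuments pages through the results.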