From 0429c2f1ceef84d2e7505591a0fee9babce7eb52 Mon Sep 17 00:00:00 2001 From: terorie Date: Sun, 29 Jul 2018 04:09:59 +0200 Subject: [PATCH] Channel url dump CLI --- browseajax/get.go | 10 ++--- browseajax/parse.go | 37 ++++++++++++----- channel.go | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++++ main.go | 3 +- 4 files changed, 149 insertions(+), 16 deletions(-) create mode 100644 channel.go diff --git a/browseajax/get.go b/browseajax/get.go index 0b7e94e..c75384b 100644 --- a/browseajax/get.go +++ b/browseajax/get.go @@ -1,9 +1,9 @@ package browseajax -func GetPage(channelID string, page uint) error { +func GetPage(channelID string, page uint) ([]string, error) { root, err := GrabPage(channelID, page) - if err != nil { return err } - err = ParsePage(root) - if err != nil { return err } - return nil + if err != nil { return nil, err } + urls, err := ParsePage(root) + if err != nil { return nil, err } + return urls, nil } diff --git a/browseajax/parse.go b/browseajax/parse.go index 60abbe2..9060b99 100644 --- a/browseajax/parse.go +++ b/browseajax/parse.go @@ -3,14 +3,16 @@ package browseajax import ( "github.com/valyala/fastjson" "errors" + "strings" ) -var missingData = errors.New("missing data") +var MissingData = errors.New("missing data") +var ServerError = errors.New("server error") -func ParsePage(rootObj *fastjson.Value) error { +func ParsePage(rootObj *fastjson.Value) ([]string, error) { // Root as array root, err := rootObj.Array() - if err != nil { return err } + if err != nil { return nil, err } // Find response container var container *fastjson.Value @@ -20,9 +22,18 @@ func ParsePage(rootObj *fastjson.Value) error { break } } - if container == nil { return missingData } + if container == nil { return nil, MissingData + } // Get error obj + errorExists := container.Exists( + "response", + "responseContext", + "errors", + "error", + ) + if errorExists { return nil, ServerError + } // Get items from grid itemsObj := container.Get( @@ -31,11 +42,14 @@ func ParsePage(rootObj *fastjson.Value) error { "gridContinuation", "items", ) - if itemsObj == nil { return missingData } + if itemsObj == nil { return nil, MissingData + } // Items as array items, err := itemsObj.Array() - if err != nil { return err } + if err != nil { return nil, err } + + urls := make([]string, 0) // Enumerate for _, item := range items { @@ -47,14 +61,17 @@ func ParsePage(rootObj *fastjson.Value) error { "webCommandMetadata", "url", ) - if urlObj == nil { return missingData } + if urlObj == nil { return nil, MissingData + } // URL as string urlBytes, err := urlObj.StringBytes() - if err != nil { return err } + if err != nil { return nil, err } url := string(urlBytes) - println(url) + if strings.HasPrefix(url, "/watch?v") { + urls = append(urls, "https://www.youtube.com" + url) + } } - return nil + return urls, nil } diff --git a/channel.go b/channel.go new file mode 100644 index 0000000..3e9fe54 --- /dev/null +++ b/channel.go @@ -0,0 +1,115 @@ +package main + +import ( + "github.com/spf13/cobra" + "github.com/terorie/yt-mango/browseajax" + "regexp" + "fmt" + "os" + "net/url" + "strings" + "log" + "time" + "bufio" +) + +var force bool +var offset uint32 + +var channelCmd = cobra.Command{ + Use: "channel", + Short: "Get information about a channel", +} + +var matchChannelID = regexp.MustCompile("^([\\w\\-]|(%3[dD]))+$") + +var channelDumpCmd = cobra.Command{ + Use: "dumpurls ", + Short: "Get all public video URLs from channel", + Long: "Write all videos URLs of a channel to a file", + Args: cobra.ExactArgs(2), + Run: func(cmd *cobra.Command, args []string) { + channelID := args[0] + fileName := args[1] + + if !matchChannelID.MatchString(channelID) { + // Check if youtube.com domain + _url, err := url.Parse(channelID) + if err != nil || (_url.Host != "www.youtube.com" && _url.Host != "youtube.com") { + fmt.Fprintln(os.Stderr, "Not a channel ID:", channelID) + os.Exit(1) + } + + // Check if old /user/ URL + if strings.HasPrefix(_url.Path, "/user/") { + // TODO Implement extraction of channel ID + fmt.Fprintln(os.Stderr, "New /channel/ link is required!\n" + + "The old /user/ links do not work.") + os.Exit(1) + } + + // Remove /channel/ path + channelID = strings.TrimPrefix(_url.Path, "/channel/") + if len(channelID) == len(_url.Path) { + // No such prefix to be removed + fmt.Fprintln(os.Stderr, "Not a channel ID:", channelID) + os.Exit(1) + } + + // Remove rest of path from channel ID + slashIndex := strings.IndexRune(channelID, '/') + if slashIndex != -1 { + channelID = channelID[:slashIndex] + } + } + + log.Printf("Starting work on channel ID \"%s\".", channelID) + startTime := time.Now() + + var flags int + if force { + flags = os.O_WRONLY | os.O_CREATE | os.O_TRUNC + } else { + flags = os.O_WRONLY | os.O_CREATE | os.O_EXCL + } + + file, err := os.OpenFile(fileName, flags, 0640) + defer file.Close() + writer := bufio.NewWriter(file) + defer writer.Flush() + + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + + totalURLs := 0 + for i := offset; true; i++ { + channelURLs, err := browseajax.GetPage(channelID, uint(i)) + if err != nil { + log.Printf("Aborting on error %v.", err) + break + } + if len(channelURLs) == 0 { + log.Printf("Page %d returned no videos.", i) + break + } + totalURLs += len(channelURLs) + log.Printf("Received page %d: %d videos.", i, len(channelURLs)) + + for _, _url:= range channelURLs { + _, err := writer.WriteString(_url + "\n") + if err != nil { panic(err) } + } + } + + duration := time.Since(startTime) + log.Printf("Got %d URLs in %s.", totalURLs, duration.String()) + }, +} + +func init() { + channelDumpCmd.Flags().BoolVarP(&force, "force", "f", false, "Overwrite the output file if it already exists") + channelDumpCmd.Flags().Uint32Var(&offset, "page-offset", 1, "Start getting videos at this page. (A page is usually 30 videos)") + channelCmd.AddCommand(&channelDumpCmd) +} diff --git a/main.go b/main.go index 3f7005f..d46ff00 100644 --- a/main.go +++ b/main.go @@ -21,7 +21,7 @@ func main() { Use: "yt-mango", Short: "YT-Mango is a scalable video metadata archiver", Long: "YT-Mango is a scalable video metadata archiving utility\n" + - "written by terorie with help from the-eye.eu", + "written by terorie for https://the-eye.eu/", } versionCmd := cobra.Command{ @@ -31,6 +31,7 @@ func main() { } rootCmd.AddCommand(&versionCmd) + rootCmd.AddCommand(&channelCmd) if err := rootCmd.Execute(); err != nil { fmt.Fprintln(os.Stderr, err) -- 2.11.4.GIT