2
votes

I am trying to write a script to download images from Reddit using praw, saving the images to a folder of my choice, and exporting a .csv of the results.

I think I have coded it right since the images download, I am just getting an "Arrays must be the same length" error when I try to run the script.

I think this may have something to do with the "path" field in my dictionary, but the loops look like they're appending the information properly so I don't know. I am missing 2 entries from "path" I have no idea where they are dropping.

My code is below:

#! python3
import praw
import pandas as pd
import requests

path = r'C:\\Scripts\\IMG\\'

#Reddit API Tokens
reddit = praw.Reddit(client_id='x', \
                client_secret='x', \
                user_agent='x', \
                username='x', \
                password='x')

x_dict = {"id":[], \
            "title":[], \
            "url":[], \
            "path":[]}
submissions = reddit.subreddit('x').hot(limit=100)

for submission in submissions:
    x_dict["id"].append(submission.id)
    x_dict["title"].append(submission.title)
    x_dict["url"].append(submission.url)

    if submission.url.endswith(".gifv"):
        submission.url = submission.url.replace('.com/', '.com/download/')
        submission.url = (submission.url + ".mp4")
        r = requests.get(submission.url, allow_redirects=True)
        if "gif" in r.headers['Content-Type']:
            dir2 = os.path.join(path, submission.id + ".gif")
            submission.url = (submission.url + ".gif")
            open(dir2, 'wb').write(r.content)
            print ("downloading " + submission.id + " to " + dir2)
            x_dict["path"].append(dir2)
        else:
            dir2 = os.path.join(path, submission.id + ".mp4")
            open(dir2, 'wb').write(r.content)
            print ("downloading " + submission.id + " to " + dir2)
            x_dict["path"].append(dir2)
    elif "gfycat" in submission.url:
        if "https://" in submission.url:
            dir2 = os.path.join(path, submission.id + ".mp4")
            submission.url = submission.url.replace('https://', 'https://giant.')
            submission.url = (submission.url + ".mp4")
            r = requests.get(submission.url, allow_redirects=True)
            open(dir2, 'wb').write(r.content)
            print ("downloading " + submission.id + " to " + dir2)
            x_dict["path"].append(dir2)
        else:
            dir2 = os.path.join(path, submission.id + ".mp4")
            submission.url = submission.url.replace('http://', 'http://giant.')
            submission.url = (submission.url + ".mp4")
            r = requests.get(submission.url, allow_redirects=True)
            open(dir2, 'wb').write(r.content)
            print ("downloading " + submission.id + " to " + dir2)
            x_dict["path"].append(dir2)
    elif "i.redd" in submission.url:
        if submission.url.endswith(".jpg"):
            dir2 = os.path.join(path, submission.id + ".jpg")
            r = requests.get(submission.url, allow_redirects=True)
            open(dir2, 'wb').write(r.content)
            print ("downloading " + submission.id + " to " + dir2)
            x_dict["path"].append(dir2)
        elif submission.url.endswith(".jpeg"):
            dir2 = os.path.join(path, submission.id + ".jpeg")
            r = requests.get(submission.url, allow_redirects=True)
            open(dir2, 'wb').write(r.content)
            print ("downloading " + submission.id + " to " + dir2)
            x_dict["path"].append(dir2)         
        elif submission.url.endswith(".png"):
            dir2 = os.path.join(path, submission.id + ".png")
            r = requests.get(submission.url, allow_redirects=True)
            open(dir2, 'wb').write(r.content)
            print ("downloading " + submission.id + " to " + dir2)
            x_dict["path"].append(dir2)
    elif "v.redd" in submission.url:
        dir2 = os.path.join(path, submission.id + ".mp4")
        r = requests.get(submission.media['reddit_video']['fallback_url'], allow_redirects=True)
        open(dir2, 'wb').write(r.content)
        print ("downloading " + submission.id + " to " + dir2)
        x_dict["path"].append(dir2)
    elif submission.url is None:
        print ("\\ " + submission.id + " url is none")
        x_dict["path"].append('')
    else:
        print ("\\" + submission.id + " not supported")
        x_dict["path"].append('')
        continue
print (len(x_dict["id"]))
print (len(x_dict["title"]))
print (len(x_dict["url"]))
print (len(x_dict["path"]))
x_data = pd.DataFrame(x_dict)
x_data.to_csv(os.path.join(path,'xscrape.csv'))

Output is as follows

downloading 99rdbf to C:\\Scripts\\IMG\\99rdbf.jpg
100
100
100
98
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-434-0d78dff7cb84> in <module>()
 89 print (len(x_dict["url"]))
 90 print (len(x_dict["path"]))
---> 91 x_data = pd.DataFrame(x_dict)
     92 x_data.to_csv(os.path.join(path,'xscrape.csv'))

d:\Users\localuser\AppData\Local\Continuum\anaconda3\lib\site-    packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
346                                  dtype=dtype, copy=copy)
347         elif isinstance(data, dict):
--> 348             mgr = self._init_dict(data, index, columns, dtype=dtype)
    349         elif isinstance(data, ma.MaskedArray):
    350             import numpy.ma.mrecords as mrecords

d:\Users\localuser\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in _init_dict(self, data, index, columns, dtype)
    457             arrays = [data[k] for k in keys]
    458 
--> 459         return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
    460 
    461     def _init_ndarray(self, values, index, columns, dtype=None, copy=False):

d:\Users\localuser\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in _arrays_to_mgr(arrays, arr_names, index,     columns, dtype)
   7313     # figure out the index, if necessary
   7314     if index is None:
-> 7315         index = extract_index(arrays)
   7316 
   7317     # don't force copy because getting jammed in an ndarray anyway

d:\Users\localuser\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in extract_index(data)
   7359             lengths = list(set(raw_lengths))
   7360             if len(lengths) > 1:
-> 7361                 raise ValueError('arrays must all be same length')
   7362 
   7363             if have_dicts:

ValueError: arrays must all be same length
1
Include the output of the program and, especially, the exception you're seeing; that will allow people to see, rather than guess, what line of code is producing the error. Also, I'm pretty sure your indentation here does not match your actual program; try using triple-backticks above and below the code in your markdown and pasting the code itself without changing the indentation. - cjs
The line producing the exception (the call to pd.DataFrame()) appears to be a comment in the code you posted. Don't trust yourself that the code you posted would produce the error if it were changed; delete the entire code block in your post and re-paste it again using the exact contents of the file you ran that produced the output. - cjs
Updated as requested, sorry for the wait as I am a new user here. Thank you for your patience. - cipherbear
No problem. It's not trivial to write questions well; learning to do so is part of learning programming and will take some time. - cjs

1 Answers

0
votes

The core problem here is your data structure design: it makes it easy to fall into programming errors rather than helping to prevent them.

In this answer I'm going to use a standard programmer trick: I'm not even going to try to figure out what the problem is in the current code, but simply restructure things so that problem can no longer appear.

In a CSV file each line is a sequence of closely related items. In turn, the whole file is a sequence of these lines. You want to keep the more closely related items closer together in the data structure, so your "inside" data structure of the two lists should be a sequence of the fields in a single line, and the "outside" data structure should be a sequence of the lines, which is the opposite of what you've done.

In Python there are two very common sequence data structures: list, which you already know about and are using here, and tuple which is similar to list but immutable.

For this program it's worth learning and understanding the namedtuple data structure, which is a tuple but extended with field names and a constructor that will ensure you're always using the same number of arguments. The latter is yet another data structure design decision that will help you avoid programming errors.

Define your data structure for a CSV line as follows:

from collections import namedtuple
Download = namedtuple('Download', 'id title url path')

(It's worth typing this directly into a Python interpreter (python -i or ipython) and playing around with it a bit until you get comfortable with creating and showing named tuples.)

You can then build a list of these as you do your downloads. Since a tuple is immutable we need to build it a single call to the constructor, we can create it only after we have all the information we need to do so. Then we add it to the list.

def download(id, url):
    # All the stuff you need to do an individual download here.
    return path

downloads = []
for s in submissions:
    path = download(s.id, s.url)
    dl = Download.new(s.id, s.title, s.url, path)
    downloads.append(dl)

You don't need to install Pandas to write CSV files; there's a csv module in the standard library that does a fine job. Working from an example in its documentation:

import csv

with open(os.path.join(path,'xscrape.csv'), 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerows(downloads)

(This produces a CSV file without a header line; adding one I leave as an exercise for the reader.)