CodeReview question markdown downloader

Question

This is an update to my earlier question From Q to compiler in less than 30 seconds.

As with that version, this Python script automatically downloads the markdown from any question on Code Review and saves it to a local file using Unix-style line endings.

For instance, to fetch the markdown for that older question, one could write:

python fetchQ 124479 fetchquestion.md

I'm interested in a general review including style, error handling or any other thing that could be improved.

This also has a new feature, which I'll be showing here soon, which is that this also serves as a companion application to a browser extension that I'm currently testing. In that mode, this same Python script will receive two arguments: the path to the native application app manifest and a special tag that identifies the application. See https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/Native_messaging for details on how the messaging works. That version uses the environment variable AUTOPROJECT_DIR to determine the directory into which the file is placed and the file is named after the question number. So this question, for example, would be saved as 234084.md.

This is intended to be used on Linux and only Python3.

fetchQ

#!/usr/bin/env python
""" Code Review question fetcher.  Given the number of the question, uses
the StackExchange API version 2.2 to fetch the markdown of the question and
write it to a local file with the name given as the second argument. """
import sys
import urllib.request
import urllib.parse
import urllib.error
import io
import os
import gzip
import json
import struct
import html.parser
from subprocess import call


def make_URL(qnumber):
    return 'https://api.stackexchange.com/2.2/questions/' + \
        str(qnumber) + \
        '/?order=desc&sort=activity&site=codereview' + \
        '&filter=!)5IYc5cM9scVj-ftqnOnMD(3TmXe'


def fetch_compressed_data(url):
    compressed = urllib.request.urlopen(url).read()
    stream = io.BytesIO(compressed)
    return gzip.GzipFile(fileobj=stream).read()


def fetch_question_markdown(qnumber):
    url = make_URL(qnumber)
    try:
        data = fetch_compressed_data(url)
    except urllib.error.URLError as err:
        if hasattr(err, 'reason'):
            print('Could not reach server.')
            print(('Reason: ', err.reason))
            sys.exit(1)
        elif hasattr(err, 'code'):
            print(f'Error: {err.code}: while fetching data from {url}')
            sys.exit(1)
    try:
        m = json.loads(data)
    except json.JSONDecodeError as err:
        print(f'Error: {err.msg}')
        sys.exit(1)
    return m['items'][0]


def getMessage():
    rawLength = sys.stdin.buffer.read(4)
    if len(rawLength) == 0:
        sys.exit(0)
    messageLength = struct.unpack('@I', rawLength)[0]
    sendMessage(encodeMessage(f'attempting to read {messageLength} bytes'))
    message = sys.stdin.buffer.read(messageLength).decode('utf-8')
    return json.loads(message)


# Encode a message for transmission,
# given its content.
def encodeMessage(messageContent):
    encodedContent = json.dumps(messageContent).encode('utf-8')
    encodedLength = struct.pack('@I', len(encodedContent))
    return {'length': encodedLength, 'content': encodedContent}


# Send an encoded message to stdout
def sendMessage(encodedMessage):
    sys.stdout.buffer.write(encodedMessage['length'])
    sys.stdout.buffer.write(encodedMessage['content'])
    sys.stdout.buffer.flush()


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print(f'Usage: {sys.argv[0]} fetchQ questionnumber mdfilename')
        sys.exit(1)
    qnumber, qname = sys.argv[1:3]
    # are we being called as a Web Extension?
    if (qname == '[email protected]'):
        msg = getMessage()
        basedir = os.getenv('AUTOPROJECT_DIR', '/tmp')
        qnumber = msg['question_id']
        qname = f'{basedir}/{qnumber}.md'
    else:
        msg = fetch_question_markdown(qnumber)

    md = html.unescape(msg['body_markdown']).replace('\r\n', '\n').encode('utf-8')
    title = html.unescape(msg['title']).encode('utf-8')
    header = b'# [{title}](https://codereview.stackexchange.com/questions/{qnumber})\n\n'
    with open(qname, 'wb') as f:
        f.write(header)
        f.write(md)
    call(["autoproject", qname])

Is there anything preventing you from using Python 3.6+? If not, using f-strings (in e.g. make_URL() would be nice — Energya
– Energya, Commented Dec 15, 2019 at 22:39
I'm actually already using f-strings, so that's definitely a possibility. — Edward
– Edward, Commented Dec 15, 2019 at 22:40
Note, you can only answer this question if you downloaded the question with the downloader. — corsiKa
– corsiKa, Commented Dec 16, 2019 at 18:24
@Carcigenicate It's a joke - because, ya know, you can view the question with the code - but only if you have the code first - which you can't get without the downloader. Apparently it's not funny. shrug — corsiKa
– corsiKa, Commented Dec 16, 2019 at 23:22
@Carcigenicate Also hello from Woodlands. And I've played RS since '04 - even met my wife on it. — corsiKa
– corsiKa, Commented Dec 16, 2019 at 23:23

Carcigenicate · Accepted Answer · 2019-12-16 23:06:59Z

PyCharm complains on this line:

m = json.loads(data)

If the above call to fetch_compressed_data fails, and the resulting error doesn't contain a reason or code attribute, the program won't close despite the error, and will then give a not-super-helpful NameError when you try to use data. I don't know if such a situation is possible, but I might add some protection just in case. Maybe add an else and move the call to exit down to reduce redundancy:

except urllib.error.URLError as err:
    if hasattr(err, 'reason'):
        print('Could not reach server.')
        print(('Reason: ', err.reason))

    elif hasattr(err, 'code'):
        print(f'Error: {err.code}: while fetching data from {url}')

    else:
        print("Unexpected problem:", err)

    sys.exit(1)

Arguably,

if len(rawLength) == 0:

would be more idiomatic as

if not rawLength:

You can rely on empty collections being falsey (and non-empty collections being truthy).

With

{'length': encodedLength, 'content': encodedContent}

This has the problem that you're needing to use strings to create and reference the "fields" of the returned object. Strings are notorious for allowing for typo problems though, and are outside of what static checking can help you with.

It's a little more involved, but I might use a NamedTuple here:

from typing import NamedTuple

class Message(NamedTuple):
    length: bytes
    content: str

...

encodedContent = json.dumps(messageContent).encode('utf-8')
encodedLength = struct.pack('@I', len(encodedContent))
return Message(encoded_length, encoded_content)

# or, for clarity (although redundant in this case)

return Message(length=encoded_length, content=encoded_content)

...

sys.stdout.buffer.write(encodedMessage.length)
sys.stdout.buffer.write(encodedMessage.content)

Now, no more messy-looking string accessing, and the IDE can assist you.

Ben A · Accepted Answer · 2020-01-14 08:20:18Z

11

Just a couple stylistic points

Function/Variable Naming

Functions and variables should be in snake_case (PEP 8).

getMessage -> get_message
encodeMessage -> encode_message
sendMessage -> send_message

Docstrings

You can include docstrings to provide some explanation for your methods and describe your parameters and return value. (PEP 8).

Something like

def make_URL(qnumber):
    """
    Creates a URL with the passed "qnumber" and returns the URL.

    :param int qnumber: Question number to query

    :return str: Formatted URL
    """

Type Hints

You can include type hints to easily identify what types are accepted and returned.

def make_URL(qnumber: str) -> str:
    ...

edited Jan 14, 2020 at 8:20

answered Dec 15, 2019 at 23:05

Ben A

10.8k5 gold badges38 silver badges103 bronze badges

\$\begingroup\$ I had used pycodestyle on the source, but it missed those things. Thanks! \$\endgroup\$

Edward
– Edward

2019-12-16 16:54:46 +00:00
Commented Dec 16, 2019 at 16:54
\$\begingroup\$ Nice, you've not learnt from my review that your param definition isn't standard. \$\endgroup\$

Peilonrayz
– Peilonrayz ♦

2019-12-16 20:24:26 +00:00
Commented Dec 16, 2019 at 20:24
\$\begingroup\$ @Peilonrayz: My Python definitely needs improvement and I'll take all the help I can get. Are you saying that the param strings should look more like :param int qnumber: description... to align with tools like Sphinx? \$\endgroup\$

Edward
– Edward

2019-12-16 20:46:33 +00:00
Commented Dec 16, 2019 at 20:46
\$\begingroup\$ @Edward Yes, exactly that. Also I personally would use the typing plugin for Sphinx to remove the need to define int twice. (If you are using Sphinx ofc) \$\endgroup\$

Peilonrayz
– Peilonrayz ♦

2019-12-16 20:49:33 +00:00
Commented Dec 16, 2019 at 20:49

Add a comment |

RomanPerekhrest · Accepted Answer · 2019-12-16 07:22:24Z

Improving HTTP communication scheme

Instead of urllib.request/urllib.error use requests lib as well-known, elegant and simple HTTP library for Python, built for human beings.

import requests
...

fetch_compressed_data function:

def fetch_compressed_data(url):
    r = requests.get(url)
    r.raise_for_status()
    return gzip.GzipFile(fileobj=io.BytesIO(r.content)).read()

fetch_question_markdown function:

def fetch_question_markdown(qnumber):
    url = make_URL(qnumber)
    try:
        data = fetch_compressed_data(url)
    except requests.exceptions.HTTPError as err:
        print(f'HTTP Error: {err.response.status_code}: while fetching data from {url}')
        sys.exit(1)
    except requests.exceptions.RequestException as err:
        print(f'Request failed: {err}')
        sys.exit(1)

    try:
        m = json.loads(data)
    except json.JSONDecodeError as err:
        print(f'Error: {err.msg}')
        sys.exit(1)
    return m['items'][0]

(Errors and expections in requests lib)

Peilonrayz · Accepted Answer · 2019-12-17 11:48:16Z

Personally I use Prospector and Flake8 with a lot of plugins. The problem with linter runners is that they don't support many of the lint tools available in the Python ecosystem. But, for the most part these two should be good enough tho.

Note: I am in the contributors for Prospector.

So let's run these programs.

$ pip install prospector[with_everything]
$ prospector --strictness veryhigh -DF -w vulture -w mypy
$ pip install flake8
$ flake8

To increase readability of this answer I've combined the output.

Docstrings
- Multi-line docstring summary should start at the second line
- 1 blank line required between summary line and description (found 0)
- Multi-line docstring closing quotes should be on a separate line
- No whitespaces allowed surrounding docstring text
- First line should end with a period
- Missing docstring in public function
Style
- function name should be lowercase
- argument name should be lowercase
- Redefining name '...' from outer scope
- Function name "..." doesn't conform to snake_case naming style
- Variable name "..." doesn't conform to snake_case naming style
- Unnecessary parens after 'if' keyword
- Constant name "..." doesn't conform to UPPER_CASE naming style
- line too long (82 > 79 characters)
Other
- Do not use len(SEQUENCE) to determine if a sequence is empty
- (false negative) Possible unbalanced tuple unpacking with sequence: left side has 2 label(s), right side has 0 value(s)
- Unused variable 'title'

The docstrings and style problems should be relatively easy to fix. The only strange comment is the constant one. This is because you have variables in global scope, which in Python is assumed to be a global constant.

#!/usr/bin/env python
"""
Code Review question fetcher.

Given the number of the question, uses the StackExchange API version 2.2
to fetch the markdown of the question and write it to a local file with
the name given as the second argument.
"""

import sys
import urllib.request
import urllib.parse
import urllib.error
import io
import os
import gzip
import json
import struct
import html.parser
from subprocess import call


def _make_url(qnumber):
    return 'https://api.stackexchange.com/2.2/questions/' + \
        str(qnumber) + \
        '/?order=desc&sort=activity&site=codereview' + \
        '&filter=!)5IYc5cM9scVj-ftqnOnMD(3TmXe'


def _fetch_compressed_data(url):
    compressed = urllib.request.urlopen(url).read()
    stream = io.BytesIO(compressed)
    return gzip.GzipFile(fileobj=stream).read()


def _fetch_question_markdown(qnumber):
    url = _make_url(qnumber)
    try:
        data = _fetch_compressed_data(url)
    except urllib.error.URLError as err:
        if hasattr(err, 'reason'):
            print('Could not reach server.')
            print(('Reason: ', err.reason))
            sys.exit(1)
        elif hasattr(err, 'code'):
            print(f'Error: {err.code}: while fetching data from {url}')
            sys.exit(1)
    try:
        message = json.loads(data)
    except json.JSONDecodeError as err:
        print(f'Error: {err.msg}')
        sys.exit(1)
    return message['items'][0]


def _get_message():
    raw_length = sys.stdin.buffer.read(4)
    if len(raw_length) == 0:
        sys.exit(0)
    message_length = struct.unpack('@I', raw_length)[0]
    _send_message(_encode_message(
        f'attempting to read {message_length} bytes'
    ))
    message = sys.stdin.buffer.read(message_length).decode('utf-8')
    return json.loads(message)


# Encode a message for transmission,
# given its content.
def _encode_message(message_content):
    encoded_content = json.dumps(message_content).encode('utf-8')
    encoded_length = struct.pack('@I', len(encoded_content))
    return {'length': encoded_length, 'content': encoded_content}


# Send an encoded message to stdout
def _send_message(encoded_message):
    sys.stdout.buffer.write(encoded_message['length'])
    sys.stdout.buffer.write(encoded_message['content'])
    sys.stdout.buffer.flush()


def _main():
    if len(sys.argv) != 3:
        print(f'Usage: {sys.argv[0]} fetchQ questionnumber mdfilename')
        sys.exit(1)
    qnumber, qname = sys.argv[1:3]
    # are we being called as a Web Extension?
    if qname == '[email protected]':
        msg = _get_message()
        basedir = os.getenv('AUTOPROJECT_DIR', '/tmp')
        qnumber = msg['question_id']
        qname = f'{basedir}/{qnumber}.md'
    else:
        msg = _fetch_question_markdown(qnumber)

    markdown = (
        html.unescape(msg['body_markdown'])
            .replace('\r\n', '\n')
            .encode('utf-8')
    )
    title = html.unescape(msg['title']).encode('utf-8')
    header = (
        b'# [{title}]'
        b'(https://codereview.stackexchange.com/questions/{qnumber})\n\n'
    )
    with open(qname, 'wb') as question_file:
        question_file.write(header)
        question_file.write(markdown)
    call(["autoproject", qname])


if __name__ == '__main__':
    _main()

Don't use if len(foo) != 0: instead use if foo:
You didn't prepend an f to your header string to add title or qnumber to it. It should be noted that fb'' and bf'' are not valid Python syntax.
Using \ rather than () to split lines is discouraged. I'm surprised the linters didn't pick this up.
Using sys.stdout and sty.stdin is very rare. The only time I've had to use them is when I was interacting with subprocess.Popen or had low level interactions with the terminal.
sys.exit isn't really something you see in Python. If you have an error use an error.

As a quick monkey patch I'll move all the error handling outside the main.
urllib.request is discouraged in the Python documentation for most users. Please, upgrade to requests. This can remove the need for fetch_compressed_data and fetch_question_markdown.
I don't see the point in having encodeMessage and sendMessage as two separate functions. I'd personally just used a sized print.
I don't really see the point in using bytes all the time. In Python 3 strings are UTF-8 internally. So, to me, you're just making life harder by using bytes.

In total this would look more like this untested code. I can't really simplify _get_stdin_message as it would require breaking changes.

#!/usr/bin/env python
"""
Code Review question fetcher.

Given the number of the question, uses the StackExchange API version 2.2
to fetch the markdown of the question and write it to a local file with
the name given as the second argument.
"""

import sys
import os
import json
import struct
import html.parser
import subprocess

import requests


class _SilentError(Exception):
    pass


def _fetch_se_question(question_id):
    url = (
        f'https://api.stackexchange.com/2.2/questions/'
        f'{question_id}'
        f'/?order=desc&sort=activity&site=codereview'
        f'&filter=!)5IYc5cM9scVj-ftqnOnMD(3TmXe'
    )
    r = requests.get(url)
    r.raise_for_status()
    return r.json()['items'][0]


def _sized_print(content):
    length = struct.pack('@I', len(content))
    print(f'{length}{content}', end='')


def _get_stdin_message():
    raw_length = sys.stdin.buffer.read(4)
    if not raw_length:
        raise _SilentError('Message is empty')
    message_length = struct.unpack('@I', raw_length)[0]
    _sized_print(json.dumps(f'attempting to read {message_length} bytes'))
    message = sys.stdin.buffer.read(message_length).decode('utf-8')
    return json.loads(message)


def _main_inner():
    if len(sys.argv) != 3:
        raise ValueError(
            f'Usage: {sys.argv[0]} fetchQ questionnumber mdfilename'
        )

    q_id, file_name = sys.argv[1:3]
    # are we being called as a Web Extension?
    if file_name != '[email protected]':
        msg = _fetch_se_question(q_id)
    else:
        msg = _get_stdin_message()
        basedir = os.getenv('AUTOPROJECT_DIR', '/tmp')
        q_id = msg['question_id']
        file_name = f'{basedir}/{q_id}.md'

    with open(file_name, 'w') as question_file:
        title = html.unescape(msg['title'])
        question_file.write(
            f'# [{title}]'
            f'(https://codereview.stackexchange.com/questions/{q_id})\n\n'
        )
        question_file.write(
            html.unescape(msg['body_markdown'])
                .replace('\r\n', '\n')
        )

    subprocess.call(["autoproject", file_name])


def _main():
    try:
        _main_inner()
    except _SilentError:
        pass
    except Exception as err:
        print(f'{type(err).__qualname__}: {err}')
    else:
        return
    sys.exit(1)


if __name__ == '__main__':
    _main()

Stack Exchange Network

CodeReview question markdown downloader

fetchQ

4 Answers 4

Function/Variable Naming

Docstrings

Type Hints

Improving HTTP communication scheme

You must log in to answer this question.

Linked

Hot Network Questions

CodeReview question markdown downloader

fetchQ

4 Answers 4

Function/Variable Naming

Docstrings

Type Hints

Improving HTTP communication scheme

You must log in to answer this question.

Linked

Related

Hot Network Questions