From bada1c2a2ab859b483696d152114259a254480e5 Mon Sep 17 00:00:00 2001 From: Adrian Freund Date: Tue, 21 Sep 2021 15:23:14 +0200 Subject: [PATCH 1/3] Added pep2bib.py for generating bibtex entries --- .gitignore | 1 + Makefile | 4 ++ pep2bib.py | 131 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 3 ++ 4 files changed, 139 insertions(+) create mode 100755 pep2bib.py diff --git a/.gitignore b/.gitignore index b9c89215748..f348631f4f3 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ pep-0000.txt pep-0000.rst pep-????.html peps.rss +peps.bib __pycache__ *.pyc *.pyo diff --git a/Makefile b/Makefile index 0f201b0c04a..e5858d29c64 100644 --- a/Makefile +++ b/Makefile @@ -27,12 +27,16 @@ pep-0000.rst: $(wildcard pep-????.txt) $(wildcard pep-????.rst) $(wildcard pep0/ rss: $(PYTHON) pep2rss.py . +bib: pep-0000.rst + $(PYTHON) pep2bib.py . + install: echo "Installing is not necessary anymore. It will be done in post-commit." clean: -rm pep-0000.rst -rm *.html + -rm *.bib -rm -rf build update: diff --git a/pep2bib.py b/pep2bib.py new file mode 100755 index 00000000000..6287c0c64b0 --- /dev/null +++ b/pep2bib.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 + +# usage: python3 pep2bib.py . + +import datetime +import glob +import os +import re +import sys +import time +from pybtex.database import Entry, BibliographyData + +BIB_PATH = os.path.join(sys.argv[1], 'peps.bib') + + +def firstline_startingwith(full_path, text): + result = None + for line in open(full_path, encoding="utf-8"): + if result is not None: + if not line[0].strip(): # Line begins with whitespace + result += line + else: + return result + if line.startswith(text): + result = line[len(text):].strip() + return None + + +def pep_creation_dt(full_path): + created_str = firstline_startingwith(full_path, 'Created:') + # bleh, I was hoping to avoid re but some PEPs editorialize + # on the Created line + m = re.search(r'''(\d+-\w+-\d{4})''', created_str) + if not m: + # some older ones have an empty line, that's okay, if it's old + # we ipso facto don't care about it. + # "return None" would make the most sense but datetime objects + # refuse to compare with that. :-| + return datetime.datetime(*time.localtime(0)[:6]) + created_str = m.group(1) + try: + t = time.strptime(created_str, '%d-%b-%Y') + except ValueError: + t = time.strptime(created_str, '%d-%B-%Y') + return datetime.datetime(*t[:6]) + + +def pep_number(full_path): + n_str = full_path.split('-')[-1].split('.')[0] + try: + n = int(n_str) + except ValueError: + raise Exception("Can't parse pep number %s" % n_str) + + return n + + +name_first_regex = re.compile('(.*)<.*>') +mail_first_regex = re.compile('.*\((.*)\)') +name_only_regex = re.compile('(.*)') + +months = { + 1: 'jan', + 2: 'feb', + 3: 'mar', + 4: 'apr', + 5: 'may', + 6: 'jun', + 7: 'jul', + 8: 'aug', + 9: 'sep', + 10: 'oct', + 11: 'nov', + 12: 'dec', +} + + +def clean_authors(authors_str): + authors = authors_str.split(',') + cleaned = [] + for author in authors: + match = name_first_regex.match(author) + if match is None: + match = mail_first_regex.match(author) + if match is None: + match = name_only_regex.match(author) + cleaned.append(match.group(1).strip()) + return " and ".join(cleaned) + + +def main(): + # get list of peps with creation time + # (from "Created:" string in pep .rst or .txt) + peps = glob.glob('pep-*.txt') + peps.extend(glob.glob('pep-*.rst')) + + peps_with_dt = [(pep_number(full_path), pep_creation_dt(full_path), full_path) for full_path in peps] + # sort peps by date, newest first + peps_with_dt.sort() + + # generate rss items for 10 most recent peps + items = {} + for n, dt, full_path in peps_with_dt: + title = firstline_startingwith(full_path, 'Title:') + authors = firstline_startingwith(full_path, 'Author:') + authors = clean_authors(authors) + url = 'https://www.python.org/dev/peps/pep-%0.4d/' % n + item = Entry('techreport', [ + ('author', authors), + ('title', 'PEP %d: %s' % (n, title)), + ('institution', "Python Software Foundation"), + ('year', str(dt.year)), + ('month', months[dt.month]), + ('type', 'PEP'), + ('number', str(n)), + ('url', url) + ]) + items['pep%d' % n] = item + + bib = BibliographyData(items) + bib_str = bib.to_string('bibtex') + + # pybtex always quotes strings, but we want month strings unquoted, so bib styles can replace it + bib_str = re.sub('month = "(.*)"', r'month = \1', bib_str) + + with open(BIB_PATH, 'w', encoding="utf-8") as fp: + fp.write(bib_str) + + +if __name__ == '__main__': + main() diff --git a/requirements.txt b/requirements.txt index 837f41b3ef7..66436c1b78b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,6 @@ docutils >= 0.16 # For RSS feedgen >= 0.9.0 # For RSS feed + +# For bibliography +pybtex >= 0.24.0 \ No newline at end of file From aef5fa974d71dd7e4b891d4055ca09ca60e2ecc2 Mon Sep 17 00:00:00 2001 From: Adrian Freund Date: Tue, 21 Sep 2021 17:20:09 +0200 Subject: [PATCH 2/3] Pull duplicated code into extra file --- pep2bib.py | 64 ++++++++---------------------------------- pep2rss.py | 35 ++++------------------- pep_parsing_helpers.py | 56 ++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 83 deletions(-) create mode 100644 pep_parsing_helpers.py diff --git a/pep2bib.py b/pep2bib.py index 6287c0c64b0..865f05f0a28 100755 --- a/pep2bib.py +++ b/pep2bib.py @@ -2,62 +2,21 @@ # usage: python3 pep2bib.py . -import datetime import glob import os import re import sys -import time from pybtex.database import Entry, BibliographyData +from pep_parsing_helpers import pep_number, pep_creation_dt, first_line_starting_with, parse_authors + BIB_PATH = os.path.join(sys.argv[1], 'peps.bib') -def firstline_startingwith(full_path, text): - result = None - for line in open(full_path, encoding="utf-8"): - if result is not None: - if not line[0].strip(): # Line begins with whitespace - result += line - else: - return result - if line.startswith(text): - result = line[len(text):].strip() - return None - - -def pep_creation_dt(full_path): - created_str = firstline_startingwith(full_path, 'Created:') - # bleh, I was hoping to avoid re but some PEPs editorialize - # on the Created line - m = re.search(r'''(\d+-\w+-\d{4})''', created_str) - if not m: - # some older ones have an empty line, that's okay, if it's old - # we ipso facto don't care about it. - # "return None" would make the most sense but datetime objects - # refuse to compare with that. :-| - return datetime.datetime(*time.localtime(0)[:6]) - created_str = m.group(1) - try: - t = time.strptime(created_str, '%d-%b-%Y') - except ValueError: - t = time.strptime(created_str, '%d-%B-%Y') - return datetime.datetime(*t[:6]) - - -def pep_number(full_path): - n_str = full_path.split('-')[-1].split('.')[0] - try: - n = int(n_str) - except ValueError: - raise Exception("Can't parse pep number %s" % n_str) - - return n - - -name_first_regex = re.compile('(.*)<.*>') -mail_first_regex = re.compile('.*\((.*)\)') -name_only_regex = re.compile('(.*)') +name_first_regex = re.compile(r'(.*)<.*>') +mail_first_regex = re.compile(r'.*\((.*)\)') +name_only_regex = re.compile(r'(.*)') + months = { 1: 'jan', @@ -75,8 +34,7 @@ def pep_number(full_path): } -def clean_authors(authors_str): - authors = authors_str.split(',') +def authors_to_bib(authors): cleaned = [] for author in authors: match = name_first_regex.match(author) @@ -98,12 +56,12 @@ def main(): # sort peps by date, newest first peps_with_dt.sort() - # generate rss items for 10 most recent peps items = {} for n, dt, full_path in peps_with_dt: - title = firstline_startingwith(full_path, 'Title:') - authors = firstline_startingwith(full_path, 'Author:') - authors = clean_authors(authors) + title = first_line_starting_with(full_path, 'Title:') + author_string = first_line_starting_with(full_path, 'Author:') + authors = parse_authors(author_string) + authors = authors_to_bib(authors) url = 'https://www.python.org/dev/peps/pep-%0.4d/' % n item = Entry('techreport', [ ('author', authors), diff --git a/pep2rss.py b/pep2rss.py index 52b532f51d6..9337fe29f8d 100755 --- a/pep2rss.py +++ b/pep2rss.py @@ -5,15 +5,15 @@ import datetime import glob import os -import re import sys -import time import PyRSS2Gen as rssgen import docutils.frontend import docutils.nodes import docutils.parsers.rst import docutils.utils +from pep_parsing_helpers import pep_creation_dt, first_line_starting_with, parse_authors + RSS_PATH = os.path.join(sys.argv[1], 'peps.rss') @@ -53,38 +53,12 @@ def pep_abstract(full_path: str) -> str: return abstract -def firstline_startingwith(full_path, text): - for line in open(full_path, encoding="utf-8"): - if line.startswith(text): - return line[len(text):].strip() - return None - - # get list of peps with creation time # (from "Created:" string in pep .rst or .txt) peps = glob.glob('pep-*.txt') peps.extend(glob.glob('pep-*.rst')) -def pep_creation_dt(full_path): - created_str = firstline_startingwith(full_path, 'Created:') - # bleh, I was hoping to avoid re but some PEPs editorialize - # on the Created line - m = re.search(r'''(\d+-\w+-\d{4})''', created_str) - if not m: - # some older ones have an empty line, that's okay, if it's old - # we ipso facto don't care about it. - # "return None" would make the most sense but datetime objects - # refuse to compare with that. :-| - return datetime.datetime(*time.localtime(0)[:6]) - created_str = m.group(1) - try: - t = time.strptime(created_str, '%d-%b-%Y') - except ValueError: - t = time.strptime(created_str, '%d-%B-%Y') - return datetime.datetime(*t[:6]) - - peps_with_dt = [(pep_creation_dt(full_path), full_path) for full_path in peps] # sort peps by date, newest first peps_with_dt.sort(reverse=True) @@ -96,8 +70,9 @@ def pep_creation_dt(full_path): n = int(full_path.split('-')[-1].split('.')[0]) except ValueError: pass - title = firstline_startingwith(full_path, 'Title:') - author = firstline_startingwith(full_path, 'Author:') + title = first_line_starting_with(full_path, 'Title:') + authors = first_line_starting_with(full_path, 'Author:') + author = parse_authors(authors)[0] # RSS only supports one author abstract = pep_abstract(full_path) url = 'https://www.python.org/dev/peps/pep-%0.4d/' % n item = rssgen.RSSItem( diff --git a/pep_parsing_helpers.py b/pep_parsing_helpers.py new file mode 100644 index 00000000000..e94159afb8a --- /dev/null +++ b/pep_parsing_helpers.py @@ -0,0 +1,56 @@ +import re +import datetime +import time + + +def first_line_starting_with(full_path, text): + result = None + for line in open(full_path, encoding="utf-8"): + if result is not None: + if not line[0].strip(): # Line begins with whitespace + result += line + else: + return result + if line.startswith(text): + result = line[len(text):].strip() + return None + + +def pep_creation_dt(full_path): + created_str = first_line_starting_with(full_path, 'Created:') + # bleh, I was hoping to avoid re but some PEPs editorialize + # on the Created line + m = re.search(r'''(\d+-\w+-\d{4})''', created_str) + if not m: + # some older ones have an empty line, that's okay, if it's old + # we ipso facto don't care about it. + # "return None" would make the most sense but datetime objects + # refuse to compare with that. :-| + return datetime.datetime(*time.localtime(0)[:6]) + created_str = m.group(1) + try: + t = time.strptime(created_str, '%d-%b-%Y') + except ValueError: + t = time.strptime(created_str, '%d-%B-%Y') + return datetime.datetime(*t[:6]) + + +def pep_number(full_path): + n_str = full_path.split('-')[-1].split('.')[0] + try: + n = int(n_str) + except ValueError: + raise Exception("Can't parse pep number %s" % n_str) + + return n + + +def parse_authors(authors_str): + orig_authors = authors_str.split(',') + authors = [] + for author in orig_authors: + authors.append(author.strip()) + + return authors + + From a43c159c4a0f07d15adc43922261845f3abc3a58 Mon Sep 17 00:00:00 2001 From: Adrian Freund Date: Tue, 21 Sep 2021 17:22:21 +0200 Subject: [PATCH 3/3] Fixed outdated comment --- pep2bib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pep2bib.py b/pep2bib.py index 865f05f0a28..00150cc1738 100755 --- a/pep2bib.py +++ b/pep2bib.py @@ -53,7 +53,7 @@ def main(): peps.extend(glob.glob('pep-*.rst')) peps_with_dt = [(pep_number(full_path), pep_creation_dt(full_path), full_path) for full_path in peps] - # sort peps by date, newest first + # sort peps by number peps_with_dt.sort() items = {}