# Directory that holds one sub-folder per genre. It is joined to the genre
# name by plain string concatenation, so the trailing "/" is required.
base_path = "/Users/johnreid/Downloads/bbcsport/"

# Sub-folder names to load, one per genre.
genres = [
    "athletics",
    "cricket",
    "football",
    "rugby",
    "tennis",
]
def read_and_split_file(filename):
    """Return the non-blank lines of a text file, stripped of whitespace.

    Args:
        filename: Path of the file to read. The file is decoded as latin-1.

    Returns:
        A list of strings in file order, one per line that is non-empty
        after stripping leading/trailing whitespace (including the newline).
    """
    with open(filename, 'r', encoding="latin-1") as f:
        # Strip each line (this also removes the trailing "\n") ...
        stripped = (line.strip() for line in f)
        # ... and keep only the lines that still have content.
        return [line for line in stripped if line]
def get_df_from_genre(path, genre):
    """Load every ``*.txt`` file of one genre into a DataFrame.

    Args:
        path: Directory prefix containing the genre sub-folder. It is
            concatenated directly with ``genre``, so it must already end
            with a path separator.
        genre: Name of the sub-folder whose ``*.txt`` files are read.

    Returns:
        A ``pandas.DataFrame`` with one row per file and the columns
        ``genre``, ``title``, ``subtitle`` and ``body``.

    Raises:
        IndexError: If a file has fewer than two non-empty lines.
    """
    # glob returns files in arbitrary order; sort for reproducible rows.
    files = sorted(glob.glob(path + genre + "/*.txt"))

    titles = []
    subtitles = []
    bodies = []
    for f in files:
        lines = read_and_split_file(f)
        titles.append(lines[0])  # First line is the title
        subtitles.append(lines[1])  # Second line is the subtitle
        bodies.append(' '.join(lines[2:]))  # Combine all the rest

    # NOTE(review): the code that originally assembled the frame is not
    # visible; these column names are a reconstruction -- confirm against
    # whatever consumes the resulting DataFrame.
    return pd.DataFrame({
        "genre": [genre] * len(titles),
        "title": titles,
        "subtitle": subtitles,
        "body": bodies,
    })
# Build one DataFrame per genre and concatenate them row-wise into a single
# frame. pd.concat is called with its defaults, so each per-genre frame keeps
# its own row index in the result.
final_df = pd.concat(
    [get_df_from_genre(base_path, genre_name) for genre_name in genres]
)