Replace series of Unicode characters / Python / Twitter - python-3.x

I am pulling text from a tweet using the Twitter API and Python 3.3 and I'm running into the part of the tweet where the tweeter put three symbols in the tweet. They are shown below.
The two flags and the thumbs up seem to be causing the problem. The following is the plain text tweet.
RT #John_Hunt07: Just voted for #marcorubio is Florida! I am ready for a New American Century!! #FLPrimary \ud83c\uddfa\ud83c\uddf8\ud83c\uddfa\ud83c\uddf8\ud83d\udc4d
The following is the code I'm using.
import json
import mysql.connector
import sys
from datetime import datetime
from MySQLCL import MySQLCL
class Functions(object):
    """Static helpers that clean and format tweet fields.

    Every method is a ``@staticmethod`` and is called on the class itself,
    e.g. ``Functions.UnicodeFilter(text)``.
    """

    # Code point -> replacement used by UnicodeFilter (None deletes the char).
    _UNICODE_REPLACEMENTS = {
        0x2019: None,             # right single quotation mark
        0x003C: "(lessthan)",     # <
        0x003E: "(greaterthan)",  # >
        # Surrogate halves of the flag / thumbs-up symbols in the sample tweet.
        0xD83C: None,
        0xDDFA: None,
        0xDDF8: None,
        0xD83D: None,
        0xDC4D: None,
    }

    @staticmethod
    def Clean(string):
        """Return ``str(string)`` without quotes, parentheses or commas.

        Leading and trailing whitespace is stripped after the removals.
        """
        temp = str(string)
        for unwanted in ("'", "(", ")", ","):
            temp = temp.replace(unwanted, "")
        return temp.strip()

    @staticmethod
    def ParseTweet(string):
        """Print the filtered text of every tweet in ``string``.

        ``string`` is iterated as a sequence of tweet mappings. Each mapping
        must contain the keys read below (a missing one raises ``KeyError``),
        except ``possibly_sensitive``, which falls back to ``""``.
        """
        for tweet in string:
            # Most of these locals feed only the disabled INSERT below; they
            # are still read so a malformed tweet fails here, as before.
            tweetid = tweet["id_str"]
            tweetcreated = tweet["created_at"]
            tweettext = tweet["text"]
            tweetsource = tweet["source"]
            truncated = tweet["truncated"]
            inreplytostatusid = tweet["in_reply_to_status_id"]
            inreplytouserid = tweet["in_reply_to_user_id"]
            inreplytoscreenname = tweet["in_reply_to_screen_name"]
            geo = tweet["geo"]
            coordinates = tweet["coordinates"]
            place = tweet["place"]
            contributors = tweet["contributors"]
            isquotestatus = tweet["is_quote_status"]
            retweetcount = tweet["retweet_count"]
            favoritecount = tweet["favorite_count"]
            favorited = tweet["favorited"]
            retweeted = tweet["retweeted"]
            possiblysensitive = tweet.get("possibly_sensitive", "")
            language = tweet["lang"]
            #print(possiblysensitive)
            print(Functions.UnicodeFilter(tweettext))
            #print(inreplytouserid)
            # NOTE(review): the disabled statements below build SQL by string
            # concatenation from tweet content; switch to a parameterized
            # query before re-enabling them.
            #print("INSERT INTO tweet(ExTweetID, TweetText, Truncated, InReplyToStatusID, InReplyToUserID, InReplyToScreenName, IsQuoteStatus, RetweetCount, FavoriteCount, Favorited, Retweeted, Language, TweetDate, TweetSource, PossiblySensitive) VALUES (" + str(tweetid) + ", '" + Functions.UnicodeFilter(tweettext) + "', " + str(truncated) + ", " + Functions.CheckNull(inreplytostatusid) + ", " + Functions.CheckNull(inreplytouserid) + ", '" + Functions.CheckNull(inreplytoscreenname) + "', " + str(isquotestatus) + ", " + str(retweetcount) + ", " + str(favoritecount) + ", " + str(favorited) + ", " + str(retweeted) + ", '" + str(language) + "', '" + Functions.ToSQL(tweetcreated) + "', '" + Functions.ToSQL(tweetsource) + "', " + str(possiblysensitive) + ")")
            #MySQLCL.Set("INSERT INTO tweet(ExTweetID, TweetText, Truncated, InReplyToStatusID, InReplyToUserID, InReplyToScreenName, IsQuoteStatus, RetweetCount, FavoriteCount, Favorited, Retweeted, Language, TweetDate, TweetSource, PossiblySensitive) VALUES (" + str(tweetid) + ", '" + tweettext + "', " + str(truncated) + ", " + Functions.CheckNullNum(inreplytostatusid) + ", " + Functions.CheckNullNum(inreplytouserid) + ", '" + Functions.CheckNull(inreplytoscreenname) + "', " + str(isquotestatus) + ", " + str(retweetcount) + ", " + str(favoritecount) + ", " + str(favorited) + ", " + str(retweeted) + ", '" + language + "', '" + str(Functions.FormatDate(tweetcreated)) + "', '" + str(Functions.UnicodeFilter(tweetsource)) + "', " + str(possiblysensitive) + ")")

    @staticmethod
    def ToBool(variable):
        """Map ``'true'`` / ``'false'`` (any letter case) to a bool.

        Returns ``None`` for any other string.
        """
        lowered = variable.lower()
        if lowered == 'true':
            return True
        if lowered == 'false':
            return False
        return None

    @staticmethod
    def CheckNullNum(var):
        """Return ``"0"`` for ``None``, otherwise ``str(var)``."""
        return "0" if var is None else str(var)

    @staticmethod
    def CheckNull(var):
        """Return ``""`` for ``None``, otherwise ``var`` unchanged."""
        return "" if var is None else var

    @staticmethod
    def ToSQL(var):
        """Return ``var`` with every single quote (``'``) removed."""
        return str(var.replace("'", ""))

    @staticmethod
    def UnicodeFilter(var):
        """Replace or drop the characters in ``_UNICODE_REPLACEMENTS``.

        The result is then passed through ``ToSQL``. Characters not in the
        table are left untouched.
        """
        # One pass over the string instead of one replace() call per character.
        return Functions.ToSQL(var.translate(Functions._UNICODE_REPLACEMENTS))

    @staticmethod
    def FormatDate(var):
        """Convert a ``created_at`` timestamp to ``Y-M-DTH:M:S``.

        ``var`` must match ``"%a %b %d %H:%M:%S %z %Y"``. The fields of the
        result are not zero-padded.
        """
        dt = datetime.strptime(var, "%a %b %d %H:%M:%S %z %Y")
        return "{}-{}-{}T{}:{}:{}".format(
            dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
As you can see, I've been using the function UnicodeFilter in order to try to filter out the unicode characters in hex. The function works when dealing with single unicode characters, but when encountering multiple unicode characters placed together, this method fails and gives the following error:
'charmap' codec can't encode characters in position 107-111: character maps to <undefined>
Do any of you have any ideas about how to get past this problem?
UPDATE: I have tried Andrew Godbehere's solution and I was still running into the same issues. However, I decided to see if there were any specific characters that were causing a problem, so I decided to print the characters to the console character by character. That gave me the error as follows:
'charmap' codec can't encode character '\U0001f1fa' in position 0: character maps to <undefined>
Upon seeing this, I added this to the UnicodeFilter function and continued testing. I have run into multiple errors of the same kind while printing the tweets character by character. However, I don't want to have to keep making these exceptions. For example, see the revised UnicodeFilter function:
@staticmethod
def UnicodeFilter(var):
    """Strip or replace a hand-maintained list of problem characters.

    ``var`` is first round-tripped through UTF-8 with ``errors='ignore'``,
    which drops anything that cannot be encoded. Each listed character is
    then replaced, and the result goes through ``Functions.ToSQL``.
    """
    temp = var.encode(errors='ignore').decode('utf-8')
    # One (character, replacement) entry per symbol that has caused an error
    # so far; the list has to grow whenever a new symbol shows up.
    replacements = (
        (chr(0x2019), ""),
        (chr(0x003c), "(lessthan)"),
        (chr(0x003e), "(greaterthan)"),
        (chr(0xd83c), ""),
        (chr(0xddfa), ""),
        (chr(0xddf8), ""),
        (chr(0xd83d), ""),
        (chr(0xdc4d), ""),
        (chr(0x2026), ""),
        (u"\U0001F1FA", ""),
        (u"\U0001F1F8", ""),
        (u"\U0001F44D", ""),
        (u"\U00014F18", ""),
        (u"\U0001F418", ""),
        (u"\U0001F918", ""),
        (u"\U0001F3FD", ""),
        (u"\U0001F195", ""),
    )
    for target, substitute in replacements:
        temp = temp.replace(target, substitute)
    return str(Functions.ToSQL(temp))
I don't want to have to add a new line for every character that causes a problem. Through this method, I have been able to pass multiple tweets, but this issue resurfaces with every tweet that contains different symbols. Is there not a solution that will filter out all these characters? Is it possible to filter out all characters not in the utf-8 character set?

Try the built-in unicode encode/decode error handling functionality: str.encode(errors='ignore')
For example:
# Sample tweet text. The literal is not a raw string, so each \uXXXX escape
# below becomes a single (surrogate) code point in the resulting str.
problem_string = """\
RT #John_Hunt07: Just voted for #marcorubio is Florida! I am ready for a New American Century!! #FLPrimary \ud83c\uddfa\ud83c\uddf8\ud83c\uddfa\ud83c\uddf8\ud83d\udc4d
"""
# encode() defaults to UTF-8; errors='ignore' drops whatever cannot be encoded,
# and decode() turns the remaining bytes back into a printable str.
print(problem_string.encode(errors='ignore').decode('utf-8'))
Ignoring errors removes problematic characters.
> RT #John_Hunt07: Just voted for #marcorubio is Florida! I am ready for a New American Century!! #FLPrimary
Other error handling options may be of interest.
xmlcharrefreplace for instance would yield:
> RT #John_Hunt07: Just voted for #marcorubio is Florida! I am ready for a New American Century!! #FLPrimary &#55356;&#56826;&#55356;&#56824;&#55356;&#56826;&#55356;&#56824;&#55357;&#56397;
If you require custom filtering as implied by your UnicodeFilter function, see Python documentation on registering an error handler.

Python provides a useful stacktrace so you can tell where errors are coming from.
Using it, you will have found that your print is causing the exception.
print() is failing because you're running Python from the Windows console, which by default only supports your local 8-bit character map (code page). You can add Unicode support with: https://github.com/Drekin/win-unicode-console
You can also just write your data straight to a text file. Open the file with:
open('output.txt', 'w', encoding='utf-8')

Found the answer. The issue was that there was a range of characters in the tweets that were causing problems. Once I found the correct Unicode range for the characters, I implemented the for loop to replace any occurrence of any Unicode character within that range. After implementing that, I was able to pull thousands of tweets without any formatting or MySQL errors at all.
@staticmethod
def UnicodeFilter(var):
    """Normalise a tweet string and remove a whole block of symbol characters.

    The right single quotation mark (U+2019) becomes a plain ``'`` and the
    ellipsis (U+2026) is removed. Every character whose code point lies in
    127381..129304 (U+1F195..U+1F918) is dropped, and the result is passed
    through ``MySQLCL.ToSQL``.
    """
    temp = var.replace(chr(0x2019), "'").replace(chr(0x2026), "")
    # Single pass over the text: same range as range(127381, 129305), without
    # scanning the whole string once per code point.
    temp = "".join(ch for ch in temp if not 127381 <= ord(ch) < 129305)
    return str(MySQLCL.ToSQL(temp))

Related

embed list in a string and run it with exec on python

I was trying to build a line of Python as a string and run it with exec().
The string is supposed to call a function in a module and pass it parameters. Here the parameter is a list, and when I insert it into the string and call exec() on it,
I get: TypeError: can only concatenate str (not "list") to str
my code :
# Import the module skill.<name> and call skill.<name>(...) via exec().
# Only the first element of skill_name is used as the name.
def skill_manager(skill_name,parameter_list):
skill_name = skill_name[0]
print(parameter_list)
# NOTE(review): both exec() calls run source text assembled from the arguments;
# verify skill_name never comes from untrusted input.
importline = "import skill."+skill_name+" as skill"
exec(importline)
# NOTE(review): this check runs after skill_name[0] and the concatenation above,
# so a None name would already have raised before reaching it.
if not skill_name == None:
# str + list is not allowed: when parameter_list is a list this line raises
# "TypeError: can only concatenate str (not "list") to str".
runline = "skill." + skill_name + "(" +"'"+ parameter_list + "'" + ")"
print(runline)
exec(runline)
error :
..... line 40, in skill_manager
runline = "skill." + skill_name + "(" +"'"+ parameter_list + "'" + ")"
TypeError: can only concatenate str (not "list") to str
code
error
I tried converting the list into a string and converting it back into a list on the other side, but that would have to be done in every skill.
I somehow need to pass the list itself from skill_manager().

Can't print a number despite using str/int

I'm a noob trying to learn Python 3. I'm trying to include half_age in the output as a string without directly writing the number 9 as a string, but I couldn't figure it out.
I've tried:
print = str(18//2)
print = int(18//2)
print = float(18/2)
my_age = 18
half_age = (18//2)
name = "Kenny!"
greeting = "Kia Ora, "
print(greeting + name)
print("Your age is " + my_age + "and half your age is " + str(half_age ))
print("Your age is " + my_age + "and half your age is " + str(half_age ))
TypeError: can only concatenate str (not "int") to str
Try converting all of your numbers with str(), i.e.:
my_age = 18
half_age = (18//2)
name = "Kenny!"
greeting = "Kia Ora, "
print(greeting + name)
print("Your age is " + str(my_age) + " and half your age is " + str(half_age))
Just use modern f-strings:
my_age = 18
half_age = (18//2)
name = "Kenny"
greeting = "Kia Ora"
print(f'{greeting}, {name}!')
print(f"Your age is {my_age} and half your age is {half_age}")
or
print(f"Your age is {my_age} and half your age is {my_age/2}")

rule identifier failed predicate

Recently, I tried to run some Pig scripts from Groovy. Here is my code:
def appID = ['pub000000', 'pub000004', 'pub000001', 'pub000004'] as Object[]
before :appInfo = new Object[4]
now: **def appInfo = ['info1','info2','info3','info4']**
for (int i = 0; i < appInfo.size(); i++) {
//Load all the related appInfo tables
pigServer.registerQuery("${appInfo[i]} = LOAD'hbase://Information.${appID[i]}' " +
"USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('meta:number1 " +
"meta:number2') " +
"AS (number1:chararray, number2:chararray);")
}
pigServer.registerQuery("totalAppinfo = UNION ${appInfo[0]},${appInfo[1]},${appInfo[2]},${appInfo[3]};")
I worked it out finally, just to give the array the value.
Complete guess (never used piglatin), but does this work?
// Register one LOAD per application id, then UNION all of the loaded relations.
def appID = ['pub000000', 'pub000004', 'pub000001', 'pub000004']
appID.each { id ->
    pigServer.registerQuery( "${id} = LOAD'hbase://Information.${id}' " +
        "USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('meta:number1 " +
        "meta:number2') " +
        "AS (number1:chararray, number2:chararray);")
}
// The list is named appID (capital "ID"); "appId" is not defined anywhere.
pigServer.registerQuery("totalAppinfo = UNION ${appID.join(',')};")
Edit after update to question:
def appID = ['pub000000', 'pub000004', 'pub000001', 'pub000004']
def appInfo = ['info1','info2','info3','info4']
[appInfo,appID].transpose().each { info, id ->
pigServer.registerQuery( "${info} = LOAD'hbase://Information.${id}' " +
"USING org.apache.pig.backend.hadoop.hbase.HBaseStorage('meta:number1 " +
"meta:number2') " +
"AS (number1:chararray, number2:chararray);")
}
pigServer.registerQuery("totalAppinfo = UNION ${appInfo.join(',')};")

How to go to next line while using a loop to setText in JTextArea?

This is my code
for (int m=0; m < i ; m++){
ta1.setText( s[m].getName().toString() + ", " + s[m].getProgramName().toString() + ", " + s[m].getUni1() + ", " + s[m].getUni2() + ", " + s[m].getUni3() + ", " );
}
It's supposed to print one line per entry of an array of students (called s) into a JTextArea (called ta1). The problem is that it always prints only the last student in the array.
I need to print each student on a new line. Could anyone help me sort it out?
setText replaces the whole content of the text area, so each pass through the loop overwrites the text set by the previous pass, and only the last student remains.
Try doing this.
// Collect every line first. setText() replaces the area's content, so it is
// called once, after the loop. The builder needs its own name because "s"
// is already the student array.
StringBuilder text = new StringBuilder();
for (int m = 0; m < i; m++) {
    text.append(s[m].getName().toString())
        .append(", ")
        .append(s[m].getProgramName().toString())
        .append("\n"); // one student per line
}
ta1.setText(text.toString());
Create a string and add each entry to it then add new line to end of each entry "\n"
Then do.
ta1.setText(s);
setText overwrites whatever is the current text.
You need append instead; you also need a "\n" at the end of a line.

Why am I getting incorrect values for string length?

My professor is teaching us Scala using Horstmann's book "Scala for the Impatient", and one of our homework exercises is straight from the book: Chapter 4, exercise 2.
We are expected to read in the eBook in text format. The professor has specified that the input file should be "Moby Dick", available for free from Project Gutenberg here: http://www.gutenberg.org/ebooks/2701.txt.utf-8
My code works as far as counting instances of words. However, he has added the requirement that we must format the output in two columns, with words left-justified and counts right-justified. To do so, I am determining the longest word in the book so I can figure out the width of the "word" column. However, the values I am getting for the lengths of the strings are just wrong. In fact, it tells me that all the strings are the same length. "a" is being reported as length 26, just as "Whale", "Ishmael", etc. are.
Here's the code:
// Counts how often each word occurs in "moby.txt" and keeps track of the
// longest word seen, printing a progress line for every token read.
object Chapter4Exercise2 extends App {
// NOTE(review): nothing visible in this object uses util.Sorting -- confirm before removing.
import util.Sorting._
// open the input file; Scanner.next() hands back one whitespace-delimited token at a time
val inputFile = new java.util.Scanner(new java.io.File("moby.txt"))
// mutable map of word -> count; words not seen yet read as 0
val wordMap = collection.mutable.Map[String, Int]() withDefault (_ => 0)
// length of the longest word so far (wanted later for column width) and the word last stored
var longestWord = 0
var theWord: String = ""
// process every token in the input file
while (inputFile hasNext) {
// lower-case the token, trim it, and keep only its letter characters
var nextWord = inputFile.next().toLowerCase().trim().filter(Character.isLetter(_))
// NOTE: without braces the `if` guards only `longestWord = nextWord.size`; the two
// statements after the semicolons (`theWord = nextWord` and the println) run on
// every iteration, whatever the comparison yields.
if (nextWord.size > longestWord) longestWord = nextWord.size; theWord = nextWord; println(theWord + " " + longestWord)
// bump the count stored under this word
wordMap(nextWord) += 1
}
println("Longest word is " + theWord + " at " + longestWord + " Characters")
}
The output of these lines:
if (nextWord.size > longestWord) longestWord = nextWord.size; theWord = nextWord; println(theWord + " " + longestWord)
and
println("Longest word is " + theWord + " at " + longestWord + " Characters")
is way off. It's telling me that EVERY word in the input file is 26 characters long!
Here's a small sample of what's being output:
husks 26
on 26
a 26
surfbeaten 26
beach 26
and 26
then 26
diving 26
down 26
into 26
What am I missing/doing wrong?
if (nextWord.size > longestWord) longestWord = nextWord.size; theWord = nextWord; println(theWord + " " + longestWord)
You shouldn't write multiple statements on a single line like that. Let's write this out in multiple lines and properly indent it:
if (nextWord.size > longestWord)
longestWord = nextWord.size
theWord = nextWord
println(theWord + " " + longestWord)
Do you see the problem now?
Try putting { and } around your if statement alternatives.
You can avoid this kind of pitfall by formatting your code in a structured manner - always using braces around code blocks.
if (nextWord.size > longestWord)
{
longestWord = nextWord.size;
theWord = nextWord;
println(theWord + " " + longestWord);
}
Your current code is equivalent to
if (nextWord.size > longestWord)
{
longestWord = nextWord.size;
}
theWord = nextWord;
println(theWord + " " + longestWord);

Resources